Skip to content

Commit 2f74099

Browse files
authored
regex.pcre: small fixes (#27341)
1 parent 62ec38a commit 2f74099

3 files changed

Lines changed: 125 additions & 7 deletions

File tree

vlib/regex/pcre/README.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -100,12 +100,13 @@ Supports backreferences like `$1`, `$2`.
100100
fn (r Regex) replace(text string, repl string) string
101101
```
102102

103-
### `change_stack_depth`
104-
Updates the maximum backtracking depth for the VM.
105-
Default is 1024.
106-
Use this if your pattern is extremely complex and returns `none` prematurely.
103+
### `max_stack_depth` (configuration field)
104+
Controls the maximum backtracking depth for the VM.
105+
Default is `2048`. Increase this value if complex patterns return `none` prematurely due to
106+
deep backtracking; decrease it to limit memory usage.
107107
```v ignore
108-
fn (mut r Regex) change_stack_depth(depth int)
108+
mut r := pcre.compile(pattern)!
109+
r.max_stack_depth = 4096
109110
```
110111

111112
---

vlib/regex/pcre/regex.v

Lines changed: 47 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ Key Architectural Features and Optimizations:
5757

5858
module pcre
5959

60+
import strconv
6061
import strings
6162

6263
/******************************************************************************
@@ -664,6 +665,9 @@ fn parse_nodes(pattern string, pos_start int, terminator rune, group_counter_sta
664665
return error('Unclosed named group')
665666
}
666667
name := pattern[pos..end]
668+
if name in group_map {
669+
return error('Duplicate named group: ${name}')
670+
}
667671
idx = group_counter
668672
group_map[name] = idx
669673
pos = end + 1
@@ -774,6 +778,38 @@ fn parse_nodes(pattern string, pos_start int, terminator rune, group_counter_sta
774778
typ: .uppercase_char
775779
}
776780
}
781+
`x` {
782+
// \xHH - two hex digits decode to a character
783+
if pos + 2 > pattern.len {
784+
return error('\\x requires exactly 2 hex digits')
785+
}
786+
hex_str := pattern[pos..pos + 2]
787+
val := strconv.parse_uint(hex_str, 16, 32) or {
788+
return error('Invalid hex escape \\x${hex_str}')
789+
}
790+
pos += 2
791+
parsed_nodes << Node{
792+
typ: .chr
793+
chr: rune(val)
794+
ignore_case: current_flags.ignore_case
795+
}
796+
}
797+
`X` {
798+
// \XHHHH - four hex digits decode to a Unicode codepoint
799+
if pos + 4 > pattern.len {
800+
return error('\\X requires exactly 4 hex digits')
801+
}
802+
hex_str := pattern[pos..pos + 4]
803+
val := strconv.parse_uint(hex_str, 16, 32) or {
804+
return error('Invalid hex escape \\X${hex_str}')
805+
}
806+
pos += 4
807+
parsed_nodes << Node{
808+
typ: .chr
809+
chr: rune(val)
810+
ignore_case: current_flags.ignore_case
811+
}
812+
}
777813
else {
778814
parsed_nodes << Node{
779815
typ: .chr
@@ -822,6 +858,9 @@ fn parse_nodes(pattern string, pos_start int, terminator rune, group_counter_sta
822858
} else {
823859
min
824860
}
861+
if min < 0 || (max != -1 && max < min) {
862+
return error('Invalid quantifier range {${min},${max}}')
863+
}
825864
q = Quantifier{min, max, true}
826865
pos = end + 1
827866
}
@@ -1027,7 +1066,7 @@ fn (r &Regex) vm_match(text string, start_pos int, mut m Machine) ?Match {
10271066
.split {
10281067
if stack_ptr + frame_size >= stack_max {
10291068
new_size := stack_max * 2
1030-
if new_size > 1_000_000 {
1069+
if new_size > r.max_stack_depth {
10311070
goto backtrack
10321071
}
10331072
m.stack.grow_len(new_size)
@@ -1183,7 +1222,13 @@ pub fn (r &Regex) find_all(text string) []Match {
11831222
}
11841223
if res := r.vm_match(text, i, mut m) {
11851224
matches << res
1186-
i = if res.end > i { res.end } else { i + 1 }
1225+
if res.end > i {
1226+
i = res.end
1227+
} else {
1228+
// Empty match: advance by one full rune to avoid infinite loop
1229+
_, rune_len := read_rune_at(text.str, text.len, i)
1230+
i += if rune_len > 0 { rune_len } else { 1 }
1231+
}
11871232
} else {
11881233
i++
11891234
}

vlib/regex/pcre/regex_test.v

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -635,3 +635,75 @@ fn test_compatibility_layer() {
635635
assert false, 'match_str should return none when no match is found'
636636
}
637637
}
638+
639+
// test_hex_escapes feeds `\xHH` and `\XHHHH` patterns to tst_find and
// malformed hex escapes to tst_compile_error. Both helpers are defined
// elsewhere in this test file; each tst_find row is passed as
// (pattern, subject, expected text).
fn test_hex_escapes() {
	// \xHH: two hex digits.
	// 0x41 = 'A', 0x61 = 'a', 0x20 = space.
	two_digit := [
		[r'\x41', 'ABC', 'A'],
		[r'\x61', 'abc', 'a'],
		[r'\x41+', 'AAAB', 'AAA'],
		[r'\x20\x41', ' A test', ' A'],
	]
	// \XHHHH: four hex digits (Unicode codepoint).
	// U+0041 = 'A', U+0061 = 'a', U+03B1 = 'α'.
	four_digit := [
		[r'\X0041', 'ABC', 'A'],
		[r'\X0061', 'abc', 'a'],
		[r'\X03B1', 'αβγ', 'α'],
	]
	// Several escapes in a row: \x48\x65\x6C\x6C\x6F = "Hello".
	sequences := [
		[r'\x48\x65\x6C\x6C\x6F', 'Hello World', 'Hello'],
	]
	// Groups are visited in the order above so the calls run in the same
	// sequence as when they were written out one by one.
	for group in [two_digit, four_digit, sequences] {
		for row in group {
			tst_find(row[0], row[1], row[2])
		}
	}

	// Invalid hex escapes: only 1 digit, non-hex characters, only 3 digits.
	for bad_pattern in [r'\x4', r'\xGG', r'\X004'] {
		tst_compile_error(bad_pattern)
	}
}
659+
660+
// test_duplicate_named_groups passes a pattern that reuses the group name
// `id` to tst_compile_error, then verifies that a pattern with two distinct
// names compiles, matches '12-abc', and exposes both captures by name.
fn test_duplicate_named_groups() {
	// Same name used twice.
	tst_compile_error(r'(?P<id>\d+)-(?P<id>\w+)')

	// Distinct names `a` and `b`.
	if re := pcre.compile(r'(?P<a>\d+)-(?P<b>\w+)') {
		if found := re.find('12-abc') {
			assert re.group_by_name(found, 'a') == '12'
			assert re.group_by_name(found, 'b') == 'abc'
		} else {
			assert false, 'Should match'
		}
	} else {
		assert false, 'Should compile: ${err}'
	}
}
675+
676+
// test_invalid_quantifier_ranges passes `{min,max}` quantifiers whose
// minimum exceeds the maximum to tst_compile_error, and verifies that the
// range `{0,0}` still compiles and matches.
fn test_invalid_quantifier_ranges() {
	// min > max.
	for reversed_range in [r'a{3,1}', r'a{5,2}'] {
		tst_compile_error(reversed_range)
	}

	// `a{0,0}b` against 'b': the match text is expected to be just 'b'.
	if re := pcre.compile(r'a{0,0}b') {
		if found := re.find('b') {
			assert found.text == 'b'
		} else {
			assert false, 'Should match'
		}
	} else {
		assert false, 'Should compile: ${err}'
	}
}
692+
693+
// test_find_all_utf8_safety runs find_all with patterns that can match the
// empty string over subjects containing multi-byte UTF-8 sequences.
// For every subject it checks that:
//   * find_all returns at least one match (so the loop below is not vacuous
//     and the call terminated), and
//   * both the start and the end offset of every match fall on a rune
//     boundary, i.e. do not point at a UTF-8 continuation byte (10xxxxxx).
fn test_find_all_utf8_safety() {
	// Each row is [pattern, subject].
	// 'é' is 2 bytes (0xC3 0xA9); '😀' is a 4-byte sequence.
	scenarios := [
		[r'x*', 'aé'],
		[r'y*', '😀!'],
	]
	for scenario in scenarios {
		subject := scenario[1]
		// Computed once per subject rather than once per match.
		subject_bytes := subject.bytes()
		re := pcre.compile(scenario[0]) or { panic(err) }
		matches := re.find_all(subject)
		assert matches.len > 0, 'find_all returned no matches for ${subject}'
		for m in matches {
			// An offset equal to the subject length (match at end of input)
			// has no byte to inspect and is aligned by definition.
			if m.start < subject_bytes.len {
				assert (subject_bytes[m.start] & 0xC0) != 0x80, 'Misaligned match start at ${m.start}'
			}
			if m.end < subject_bytes.len {
				assert (subject_bytes[m.end] & 0xC0) != 0x80, 'Misaligned match end at ${m.end}'
			}
		}
	}
}

0 commit comments

Comments
 (0)