Skip to content

Commit 143ff2f

Browse files
committed
Add RegexSet functionality to C API
These functions implement a C interface to the RegexSet API. Some notes: * These do not include start offsets as the standard regex functions do. The reason is how these are implemented in the core regex crate: the RegexSet API does not expose a public is_match_at, whilst the Regex API does. * This only tests a complete compile/match, mainly for sanity. One or two more tests targeting the specific areas would be preferred. * Set matches take a mutable array to fill with results. This is more C-like and allows the caller to manage the memory on the stack if they want.
1 parent 3cfef1e commit 143ff2f

File tree

3 files changed

+223
-2
lines changed

3 files changed

+223
-2
lines changed

regex-capi/ctest/test.c

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,64 @@ bool test_compile_error_size_limit() {
331331
return passed;
332332
}
333333

334+
bool test_regex_set_matches() {
335+
336+
#define PAT_COUNT 6
337+
338+
bool passed = true;
339+
const char *patterns[] = {
340+
"foo", "barfoo", "\\w+", "\\d+", "foobar", "bar"
341+
};
342+
const size_t patterns_lengths[] = {
343+
3, 6, 3, 3, 6, 3
344+
};
345+
346+
rure_error *err = rure_error_new();
347+
rure_set *re = rure_compile_set((const uint8_t **) patterns,
348+
patterns_lengths,
349+
PAT_COUNT, err);
350+
if (re == NULL) {
351+
passed = false;
352+
goto done2;
353+
}
354+
355+
if (!rure_set_is_match(re, (const uint8_t *) "foobar", 6)) {
356+
passed = false;
357+
goto done1;
358+
}
359+
360+
if (rure_set_is_match(re, (const uint8_t *) "", 0)) {
361+
passed = false;
362+
goto done1;
363+
}
364+
365+
bool matches[PAT_COUNT];
366+
if (!rure_set_matches(re, (const uint8_t *) "foobar", 6, matches)) {
367+
passed = false;
368+
goto done1;
369+
}
370+
371+
const bool match_target[] = {
372+
true, false, true, false, true, true
373+
};
374+
375+
int i;
376+
for (i = 0; i < PAT_COUNT; ++i) {
377+
if (matches[i] != match_target[i]) {
378+
passed = false;
379+
goto done1;
380+
}
381+
}
382+
383+
done1:
384+
rure_set_free(re);
385+
done2:
386+
rure_error_free(err);
387+
return passed;
388+
389+
#undef PAT_COUNT
390+
}
391+
334392
void run_test(bool (test)(), const char *name, bool *passed) {
335393
if (!test()) {
336394
*passed = false;
@@ -353,6 +411,7 @@ int main() {
353411
run_test(test_compile_error, "test_compile_error", &passed);
354412
run_test(test_compile_error_size_limit, "test_compile_error_size_limit",
355413
&passed);
414+
run_test(test_regex_set_matches, "test_regex_set_match", &passed);
356415

357416
if (!passed) {
358417
exit(1);

regex-capi/include/rure.h

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,13 @@ extern "C" {
2828
*/
2929
typedef struct rure rure;
3030

31+
/*
32+
* rure_set is the type of a set of compiled regular expressions.
33+
*
34+
* A rure_set can be safely used from multiple threads simultaneously.
35+
*/
36+
typedef struct rure_set rure_set;
37+
3138
/*
3239
* rure_options is the set of non-flag configuration options for compiling
3340
* a regular expression. Currently, only two options are available: setting
@@ -165,7 +172,7 @@ rure *rure_compile(const uint8_t *pattern, size_t length,
165172
/*
166173
* rure_free frees the given compiled regular expression.
167174
*
168-
* This must be called at most once.
175+
* This must be called at most once for any rure.
169176
*/
170177
void rure_free(rure *re);
171178

@@ -446,6 +453,60 @@ void rure_options_size_limit(rure_options *options, size_t limit);
446453
*/
447454
void rure_options_dfa_size_limit(rure_options *options, size_t limit);
448455

456+
/*
457+
* rure_compile_set compiles the given list of patterns into a single regular
458+
* expression which can be matched in a linear-scan. Each pattern in patterns
459+
* must be valid UTF-8 and the length of each pattern in patterns corresponds
460+
* to a byte length in patterns_lengths.
461+
*
462+
* The number of patterns to compile is specified by patterns_count. patterns
463+
* must contain at least this many entries.
464+
*
465+
* error is set (and NULL is returned) if any pattern fails to compile.
466+
*
467+
* The compiled expression set returned may be used from multiple threads.
468+
*/
469+
rure_set *rure_compile_set(const uint8_t **patterns,
470+
const size_t *patterns_lengths,
471+
size_t patterns_count, rure_error *error);
472+
473+
/*
474+
* rure_set_free frees the given compiled regular expression set.
475+
*
476+
* This must be called at most once for any rure_set.
477+
*/
478+
void rure_set_free(rure_set *re);
479+
480+
/*
481+
* rure_set_is_match returns true if and only if any regexes within the set
482+
* match anywhere in the haystack. Once a match has been located, the
483+
* matching engine will quit immediately.
484+
*
485+
* haystack may contain arbitrary bytes, but ASCII compatible text is more
486+
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
487+
* length should be the number of bytes in haystack.
488+
*/
489+
bool rure_set_is_match(rure_set *re, const uint8_t *haystack, size_t length);
490+
491+
/*
492+
* rure_set_matches compares each regex in the set against the haystack and
493+
* returns an array of bools which correspond to if a match was found for
494+
* the specified regex.
495+
*
496+
* haystack may contain arbitrary bytes, but ASCII compatible text is more
497+
* useful. UTF-8 is even more useful. Other text encodings aren't supported.
498+
* length should be the number of bytes in haystack.
499+
*
500+
* matches must be greater than or equal to the number of patterns the
501+
* rure_set was compiled with.
502+
*
503+
* Only use this function if you specifically need to know which regexes
504+
* matched within the set. To determine if any of the regexes matched without
505+
* caring which, use rure_set_is_match.
506+
*/
507+
bool rure_set_matches(rure_set *re, const uint8_t *haystack, size_t length,
508+
const bool *matches);
509+
449510
/*
450511
* rure_error_new allocates space for an error.
451512
*

regex-capi/src/rure.rs

Lines changed: 102 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,11 @@ pub struct Options {
2121
dfa_size_limit: usize,
2222
}
2323

24+
pub struct RegexSet {
25+
re: bytes::RegexSet,
26+
pattern_count: usize
27+
}
28+
2429
const RURE_FLAG_CASEI: u32 = 1 << 0;
2530
const RURE_FLAG_MULTI: u32 = 1 << 1;
2631
const RURE_FLAG_DOTNL: u32 = 1 << 2;
@@ -54,6 +59,11 @@ impl Deref for Regex {
5459
fn deref(&self) -> &bytes::Regex { &self.re }
5560
}
5661

62+
impl Deref for RegexSet {
63+
type Target = bytes::RegexSet;
64+
fn deref(&self) -> &bytes::RegexSet { &self.re }
65+
}
66+
5767
impl Default for Options {
5868
fn default() -> Options {
5969
Options {
@@ -280,7 +290,7 @@ ffi_fn! {
280290
unsafe {
281291
let cs = match CString::new(cn.as_bytes()) {
282292
Result::Ok(val) => val,
283-
Result::Err(err) => return false
293+
Result::Err(_) => return false
284294
};
285295
let ptr = cs.into_raw();
286296
it.name_ptrs.push(ptr);
@@ -453,3 +463,94 @@ ffi_fn! {
453463
options.dfa_size_limit = limit;
454464
}
455465
}
466+
467+
ffi_fn! {
    // Compiles `patterns_count` patterns into a single RegexSet.
    //
    // `patterns` and `patterns_lengths` are parallel C arrays: entry i of the
    // first points at the bytes of pattern i and entry i of the second gives
    // its byte length. Each pattern must be valid UTF-8.
    //
    // On success a heap-allocated RegexSet is returned. On failure null is
    // returned and, if `error` is non-null, it is overwritten with the cause.
    fn rure_compile_set(
        patterns: *const *const u8,
        patterns_lengths: *const size_t,
        patterns_count: size_t,
        error: *mut Error
    ) -> *const RegexSet {
        // View the two parallel C arrays as slices of `patterns_count` items.
        let pat_ptrs = unsafe {
            slice::from_raw_parts(patterns, patterns_count)
        };
        let pat_lens = unsafe {
            slice::from_raw_parts(patterns_lengths, patterns_count)
        };

        let mut pats = Vec::with_capacity(patterns_count);
        for (&pat_ptr, &pat_len) in pat_ptrs.iter().zip(pat_lens) {
            let raw = unsafe { slice::from_raw_parts(pat_ptr, pat_len) };
            match str::from_utf8(raw) {
                Ok(pat) => pats.push(pat),
                Err(err) => {
                    // Stop at the first pattern that is not valid UTF-8.
                    if !error.is_null() {
                        unsafe { *error = Error::new(ErrorKind::Str(err)); }
                    }
                    return ptr::null();
                }
            }
        }

        let set = match bytes::RegexSet::new(&pats) {
            Ok(set) => set,
            Err(err) => {
                if !error.is_null() {
                    unsafe { *error = Error::new(ErrorKind::Regex(err)); }
                }
                return ptr::null();
            }
        };

        // Cache the pattern count so rure_set_matches knows how many result
        // slots to fill.
        let pattern_count = set.len();
        Box::into_raw(Box::new(RegexSet {
            re: set,
            pattern_count: pattern_count
        }))
    }
}
517+
518+
ffi_fn! {
    // Frees a RegexSet previously returned by rure_compile_set.
    fn rure_set_free(re: *const RegexSet) {
        // Rebuild the Box that rure_compile_set leaked via Box::into_raw and
        // drop it explicitly; Box::from_raw's result is #[must_use], so a
        // bare statement would discard it with a warning.
        unsafe { drop(Box::from_raw(re as *mut RegexSet)); }
    }
}
523+
524+
ffi_fn! {
    // Returns true if any regex in the set matches somewhere in the `len`
    // bytes starting at `haystack`.
    fn rure_set_is_match(
        re: *const RegexSet,
        haystack: *const u8,
        len: size_t
    ) -> bool {
        let set = unsafe { &*re };
        let text = unsafe { slice::from_raw_parts(haystack, len) };
        set.is_match(text)
    }
}
535+
536+
ffi_fn! {
    // Tests every regex in the set against the haystack.
    //
    // Writes one bool per compiled pattern into `matches` (slot i is true
    // iff pattern i matched) and returns true iff at least one pattern
    // matched.
    fn rure_set_matches(
        re: *const RegexSet,
        haystack: *const u8,
        len: size_t,
        matches: *mut bool
    ) -> bool {
        let re = unsafe { &*re };
        // SAFETY: relies on the caller providing room for at least
        // `pattern_count` bools behind `matches`.
        let results = unsafe {
            slice::from_raw_parts_mut(matches, re.pattern_count)
        };
        // SAFETY: relies on the caller passing `len` readable bytes.
        let haystack = unsafe { slice::from_raw_parts(haystack, len) };
        let matches = re.matches(haystack);

        // `results` has exactly `pattern_count` slots, so enumerate() visits
        // every pattern index once.
        for (i, slot) in results.iter_mut().enumerate() {
            *slot = matches.matched(i);
        }

        matches.matched_any()
    }
}

0 commit comments

Comments
 (0)