strcapture improvements from Tim Taylor #29

tdhock · 2024-02-14T03:29:13Z

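On R-devel, @TimTaylor proposed the following `strcapture2` implementation, together with a script that times it against `strcapture`: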

#' Capture regex groups into a prototype-shaped result (PCRE fast path).
#'
#' With `perl = TRUE`, the capture offsets attached to a single `regexpr()`
#' result are used to cut every group out of `x` with one vectorised
#' `substring()` call. Any other value of `perl` is delegated to
#' `strcapture()`.
#'
#' @param pattern Regular expression; it must contain as many capture groups
#'   as `proto` has elements.
#' @param x Vector of subject strings.
#' @param proto Prototype whose length gives the expected number of capture
#'   groups; handed to `utils:::conformToProto()` together with the captures.
#' @param perl Take the fast path only when this is exactly `TRUE`.
#' @param useBytes Forwarded to `regexpr()` or `strcapture()`.
#' @return Whatever `utils:::conformToProto()` (fast path) or `strcapture()`
#'   (fallback) returns. Elements of `x` that are `NA` or do not match
#'   contribute `NA` captures.
strcapture2 <- function(pattern, x, proto, perl = FALSE, useBytes = FALSE) {
    if (!isTRUE(perl)) {
        return(
            strcapture(pattern, x, proto, perl = perl, useBytes = useBytes)
        )
    }
    m <- regexpr(pattern = pattern, text = x, perl = TRUE, useBytes = useBytes)
    nomatch <- is.na(m) | m == -1L
    ntokens <- length(proto)
    if (all(nomatch)) {
        out <- matrix(NA_character_, length(m), ntokens)
    } else {
        capture_start <- attr(m, "capture.start")
        # A NULL 'capture.start' means no capture information came back
        # (e.g. a pattern without groups); count that as zero captures so the
        # mismatch is reported clearly instead of failing on matrix indexing.
        ncaptures <- if (is.null(capture_start)) 0L else ncol(capture_start)
        if (ncaptures != ntokens) {
            stop("The number of captures in 'pattern' != 'length(proto)'")
        }
        if (ncaptures == 0L) {
            out <- matrix(NA_character_, length(m), 0L)
        } else {
            capture_end <- capture_start + attr(m, "capture.length") - 1L
            # Blank out the offsets of non-matching rows so substring()
            # yields NA for them.
            capture_end[nomatch, ] <- capture_start[nomatch, ] <- NA
            # NOTE(review): with useBytes = TRUE the offsets are presumably
            # byte-based while substring() indexes characters -- confirm for
            # multibyte input.
            captured <- substring(x, capture_start, capture_end)
            out <- matrix(captured, nrow = length(m))
        }
    }
    utils:::conformToProto(out, proto)
}
# Sample inputs: two strings containing a "First Last" name, one string
# without a match, and a missing value.
notables <- c("  Ben Franklin and Jefferson Davis", "\tMillard Fillmore",
              "Bob", NA_character_)

# Two named capture groups: a capitalised first and last name.
regex <- "(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)"

# Prototype with two character columns, one per capture group.
proto <- data.frame("", "")

# Input sizes to time: 1, 2 and 5 times each power of ten from 10^1 to 10^4.
lengths <- sort(c(1, 2, 5) %o% 10^seq_len(4))

# Number of repetitions per timing.
reps <- 20
# Time `strcapture()` against `strcapture2()` on the same input.
#
# `text` is recycled to `length` elements, then each function is called
# `reps` times with `perl = TRUE` inside `system.time()`.
#
# Returns a named numeric vector holding the "user.self" entry of each
# timing, with names `strcapture` and `strcapture2`.
time_strcapture <- function(text, length, regex, proto, reps) {
    text <- rep_len(text, length)
    base_timing <- system.time({
        for (i in seq_len(reps)) {
            strcapture(regex, text, proto, perl = TRUE)
        }
    })
    fast_timing <- system.time({
        for (i in seq_len(reps)) {
            strcapture2(regex, text, proto, perl = TRUE)
        }
    })
    c(
        strcapture = base_timing[["user.self"]],
        strcapture2 = fast_timing[["user.self"]]
    )
}
# Time both implementations at every input length. vapply() with a named
# numeric template pins the result to a 2-row numeric matrix (rows
# `strcapture` and `strcapture2`, one column per element of `lengths`),
# whereas sapply() changes its return type with the shape of its input.
timings <- vapply(
    lengths,
    time_strcapture,
    FUN.VALUE = c(strcapture = 0, strcapture2 = 0),
    text = notables, regex = regex, reps = reps, proto = proto
)

tdhock · 2024-02-14T03:43:09Z

source:

#' Capture regex groups into a prototype-shaped result (PCRE fast path).
#'
#' With `perl = TRUE`, the capture offsets attached to a single `regexpr()`
#' result are used to cut every group out of `x` with one vectorised
#' `substring()` call. Any other value of `perl` is delegated to
#' `strcapture()`.
#'
#' @param pattern Regular expression; it must contain as many capture groups
#'   as `proto` has elements.
#' @param x Vector of subject strings.
#' @param proto Prototype whose length gives the expected number of capture
#'   groups; handed to `utils:::conformToProto()` together with the captures.
#' @param perl Take the fast path only when this is exactly `TRUE`.
#' @param useBytes Forwarded to `regexpr()` or `strcapture()`.
#' @return Whatever `utils:::conformToProto()` (fast path) or `strcapture()`
#'   (fallback) returns. Elements of `x` that are `NA` or do not match
#'   contribute `NA` captures.
strcapture2 <- function(pattern, x, proto, perl = FALSE, useBytes = FALSE) {
    if (!isTRUE(perl)) {
        return(
            strcapture(pattern, x, proto, perl = perl, useBytes = useBytes)
        )
    }
    m <- regexpr(pattern = pattern, text = x, perl = TRUE, useBytes = useBytes)
    nomatch <- is.na(m) | m == -1L
    ntokens <- length(proto)
    if (all(nomatch)) {
        out <- matrix(NA_character_, length(m), ntokens)
    } else {
        capture_start <- attr(m, "capture.start")
        # A NULL 'capture.start' means no capture information came back
        # (e.g. a pattern without groups); count that as zero captures so the
        # mismatch is reported clearly instead of failing on matrix indexing.
        ncaptures <- if (is.null(capture_start)) 0L else ncol(capture_start)
        if (ncaptures != ntokens) {
            stop("The number of captures in 'pattern' != 'length(proto)'")
        }
        if (ncaptures == 0L) {
            out <- matrix(NA_character_, length(m), 0L)
        } else {
            capture_end <- capture_start + attr(m, "capture.length") - 1L
            # Blank out the offsets of non-matching rows so substring()
            # yields NA for them.
            capture_end[nomatch, ] <- capture_start[nomatch, ] <- NA
            # NOTE(review): with useBytes = TRUE the offsets are presumably
            # byte-based while substring() indexes characters -- confirm for
            # multibyte input.
            captured <- substring(x, capture_start, capture_end)
            out <- matrix(captured, nrow = length(m))
        }
    }
    utils:::conformToProto(out, proto)
}
# Sample inputs: two strings containing a "First Last" name, one string
# without a match, and a missing value.
notables <- c("  Ben Franklin and Jefferson Davis", "\tMillard Fillmore",
              "Bob", NA_character_)

# Two named capture groups: a capitalised first and last name.
regex <- "(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)"

# Prototype with two character columns, one per capture group.
proto <- data.frame("", "")
# Compare strcapture() and strcapture2() with the atime package: both calls
# are passed as named expressions next to a `setup` expression that builds
# the input `text`.
atime.list <- atime::atime(
  setup={
    # `N` is not defined in this script; presumably atime supplies the data
    # size when it evaluates `setup` -- confirm against the atime docs.
    text <- rep_len(notables, N)
  },
  # NOTE(review): looks like a per-expression time budget in seconds -- confirm.
  seconds.limit=1,
  strcapture=strcapture(regex, text, proto, perl = TRUE),
  strcapture2=strcapture2(regex, text, proto, perl = TRUE))
# Plot the raw measurements.
plot(atime.list)
# Derive reference curves from the measurements and plot them.
ref.list <- atime::references_best(atime.list)
plot(ref.list)
# Predict at seconds = 1 and kilobytes = 1e3, then plot the predictions.
# NOTE(review): presumably this estimates the N reachable within those
# budgets -- confirm.
pred.list <- predict(ref.list, seconds=1, kilobytes=1e3)
plot(pred.list)

tdhock closed this as completed Feb 14, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

strcapture improvements from Tim Taylor #29

strcapture improvements from Tim Taylor #29

tdhock commented Feb 14, 2024

tdhock commented Feb 14, 2024 •

edited

strcapture improvements from Tim Taylor #29

strcapture improvements from Tim Taylor #29

Comments

tdhock commented Feb 14, 2024

tdhock commented Feb 14, 2024 • edited

tdhock commented Feb 14, 2024 •

edited