diff --git a/NEWS.md b/NEWS.md index e3ab052..ad1105d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,6 +9,10 @@ * `ragnar_inspector()` now renders all urls as clickable links in the chunk markdown viewer, even if url is not a formal markdown link (#82). +* Before running examples and tests we now check if ragnar can load DuckDB extensions. + This fixes issues in environments where DuckDB pre-built binaries for extensions are not + compatible with the installed DuckDB version (#94). + # ragnar 0.2.0 * `ragnar_store_create()` gains a new argument: `version`, with default `2`. diff --git a/R/aaa-utils.R b/R/aaa-utils.R index 5a26a5f..b8b7711 100644 --- a/R/aaa-utils.R +++ b/R/aaa-utils.R @@ -281,3 +281,42 @@ prop_string <- function( ) ) } + +can_load_duckdb_extensions <- local({ + # DuckDB extensions are shared libraries downloaded when + # running `INSTALL extension_name`. They are pre-built by the DuckDB + # team: https://github.com/duckdb/extension-ci-tools + # They are built for the major platforms using the standard + # compilers. + # One of the CRAN test servers is a Linux machine with R compiled + # with clang instead of GCC. Since the compilers have different + # ABIs, a crash happens when trying to execute extensions that were + # pre-built with GCC. + # To avoid the crash on CRAN machines, we check in a separate process + # whether the extensions can be loaded, and only proceed if that succeeds. 
+ can <- NULL # cached result of the check; NULL until the first call runs it + function() { + if (is.null(can)) { + can <<- 0 == system2( + rscript_exe(), + "-", + input = c( + "con <- DBI::dbConnect(duckdb::duckdb())", + "DBI::dbExecute(con, 'INSTALL fts; LOAD fts;')", + "DBI::dbExecute(con, 'INSTALL vss; LOAD vss;')" + ), + stderr = FALSE, + stdout = FALSE + ) + } + can + } +}) + +rscript_exe <- function() { + file.path( + R.home("bin"), + if (is_windows()) "Rscript.exe" else "Rscript" + ) +} + diff --git a/R/retrieve.R b/R/retrieve.R index f205204..850ea7d 100644 --- a/R/retrieve.R +++ b/R/retrieve.R @@ -484,7 +484,7 @@ ragnar_retrieve_vss_and_bm25 <- function(store, text, top_k = 3, ...) { #' represents a chunk and always contains a `text` column. #' #' @family ragnar_retrieve -#' @examplesIf (rlang::is_installed("dbplyr") && nzchar(Sys.getenv("OPENAI_API_KEY"))) +#' @examplesIf (rlang::is_installed("dbplyr") && nzchar(Sys.getenv("OPENAI_API_KEY")) && ragnar:::can_load_duckdb_extensions()) #' ## Build a small store with categories #' store <- ragnar_store_create( #' embed = \(x) ragnar::embed_openai(x, model = "text-embedding-3-small"), diff --git a/R/store.R b/R/store.R index 6634332..3cc2590 100644 --- a/R/store.R +++ b/R/store.R @@ -55,7 +55,7 @@ #' #' @returns a `RagnarStore` object #' @export -#' @examples +#' @examplesIf ragnar:::can_load_duckdb_extensions() #' # A store with a dummy embedding #' store <- ragnar_store_create( #' embed = \(x) matrix(stats::runif(10), nrow = length(x), ncol = 10), diff --git a/configure b/configure index dd74c59..4e4cd7c 100755 --- a/configure +++ b/configure @@ -1,2 +1,2 @@ #!/bin/sh -"${R_HOME}/bin/Rscript" tools/configure_reticulate.R +"${R_HOME}/bin/Rscript" tools/configure_deps.R diff --git a/configure.win b/configure.win index dd74c59..4e4cd7c 100755 --- a/configure.win +++ b/configure.win @@ -1,2 +1,2 @@ #!/bin/sh -"${R_HOME}/bin/Rscript" tools/configure_reticulate.R +"${R_HOME}/bin/Rscript" tools/configure_deps.R diff --git a/man/ragnar_retrieve.Rd b/man/ragnar_retrieve.Rd 
index 14edc80..807c6b3 100644 --- a/man/ragnar_retrieve.Rd +++ b/man/ragnar_retrieve.Rd @@ -30,7 +30,7 @@ union of chunks retrieved by both methods. The results are not re-ranked after identifying the unique values. } \examples{ -\dontshow{if ((rlang::is_installed("dbplyr") && nzchar(Sys.getenv("OPENAI_API_KEY")))) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\dontshow{if ((rlang::is_installed("dbplyr") && nzchar(Sys.getenv("OPENAI_API_KEY")) && ragnar:::can_load_duckdb_extensions())) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} ## Build a small store with categories store <- ragnar_store_create( embed = \(x) ragnar::embed_openai(x, model = "text-embedding-3-small"), diff --git a/man/ragnar_retrieve_vss.Rd b/man/ragnar_retrieve_vss.Rd index 022b11f..264dbaa 100644 --- a/man/ragnar_retrieve_vss.Rd +++ b/man/ragnar_retrieve_vss.Rd @@ -60,7 +60,7 @@ sets or filtered queries. The results are not re-ranked after identifying the unique values. } \examples{ -\dontshow{if ((rlang::is_installed("dbplyr") && nzchar(Sys.getenv("OPENAI_API_KEY")))) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\dontshow{if ((rlang::is_installed("dbplyr") && nzchar(Sys.getenv("OPENAI_API_KEY")) && ragnar:::can_load_duckdb_extensions())) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} ## Build a small store with categories store <- ragnar_store_create( embed = \(x) ragnar::embed_openai(x, model = "text-embedding-3-small"), diff --git a/man/ragnar_retrieve_vss_and_bm25.Rd b/man/ragnar_retrieve_vss_and_bm25.Rd index 33483f2..14d51c6 100644 --- a/man/ragnar_retrieve_vss_and_bm25.Rd +++ b/man/ragnar_retrieve_vss_and_bm25.Rd @@ -27,7 +27,7 @@ documents. The results are not re-ranked after identifying the unique values. 
} \examples{ -\dontshow{if ((rlang::is_installed("dbplyr") && nzchar(Sys.getenv("OPENAI_API_KEY")))) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\dontshow{if ((rlang::is_installed("dbplyr") && nzchar(Sys.getenv("OPENAI_API_KEY")) && ragnar:::can_load_duckdb_extensions())) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} ## Build a small store with categories store <- ragnar_store_create( embed = \(x) ragnar::embed_openai(x, model = "text-embedding-3-small"), diff --git a/man/ragnar_store_create.Rd b/man/ragnar_store_create.Rd index de2db7b..4ed984b 100644 --- a/man/ragnar_store_create.Rd +++ b/man/ragnar_store_create.Rd @@ -90,6 +90,7 @@ chunk. The easiest way to prepare \code{chunks} for \code{version = 1} is with } } \examples{ +\dontshow{if (ragnar:::can_load_duckdb_extensions()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} # A store with a dummy embedding store <- ragnar_store_create( embed = \(x) matrix(stats::runif(10), nrow = length(x), ncol = 10), @@ -133,4 +134,5 @@ ragnar_store_build_index(store) ragnar_retrieve(store, "abc bcd xyz", deoverlap = FALSE) ragnar_retrieve(store, "abc bcd xyz", deoverlap = TRUE) +\dontshow{\}) # examplesIf} } diff --git a/tests/testthat/helper-doc.R b/tests/testthat/helper-doc.R index 487205d..4c1a0c0 100644 --- a/tests/testthat/helper-doc.R +++ b/tests/testthat/helper-doc.R @@ -33,7 +33,14 @@ skip_on_cran <- function() { skip_if(maybe_on_cran(), "Maybe On CRAN") } +skip_if_cant_load_duckdb_extensions <- function() { + if (!can_load_duckdb_extensions()) { + testthat::skip("DuckDB extensions cannot be loaded") + } +} + skip_if_cant_use_motherduck <- function() { + skip_if_cant_load_duckdb_extensions() if (Sys.getenv("motherduck_token") == "") { testthat::skip("motherduck_token not set") } diff --git a/tests/testthat/test-extra-cols.R b/tests/testthat/test-extra-cols.R index 331c620..747621d 100644 --- a/tests/testthat/test-extra-cols.R +++ 
b/tests/testthat/test-extra-cols.R @@ -1,5 +1,6 @@ test_that("extra cols works", { skip_on_cran() # See comment in test-retrieve.R and test-read-markdown.R + skip_if_cant_load_duckdb_extensions() store <- ragnar_store_create( version = 2, embed = \(x) matrix(nrow = length(x), ncol = 100, stats::runif(100)), diff --git a/tests/testthat/test-retrieve.R b/tests/testthat/test-retrieve.R index d9184fa..4783f9a 100644 --- a/tests/testthat/test-retrieve.R +++ b/tests/testthat/test-retrieve.R @@ -5,6 +5,7 @@ system.time(test_that("retrieving works as expected, v1", { # > Running R code in 'testthat.R' had CPU time 2.6 times elapsed time # Unfortunately, this means we can't test properly on CRAN. skip_on_cran() + skip_if_cant_load_duckdb_extensions() # Create a simple store and insert some chunks store <- ragnar_store_create( @@ -57,6 +58,7 @@ system.time(test_that("retrieving works as expected, v1", { test_that("retrieving works as expected", { skip_on_cran() # See comment (above) in test-retrieve.R + skip_if_cant_load_duckdb_extensions() # Create a simple store and insert some chunks store <- ragnar_store_create( embed = \(x) matrix(nrow = length(x), ncol = 100, stats::runif(100)) diff --git a/tests/testthat/test-store.R b/tests/testthat/test-store.R index b7c2832..57c6a9b 100644 --- a/tests/testthat/test-store.R +++ b/tests/testthat/test-store.R @@ -222,6 +222,7 @@ test_that("additional columns", { test_that("Allow a NULL embedding function", { skip_on_cran() # See comment in test-retrieve.R + skip_if_cant_load_duckdb_extensions() store <- ragnar_store_create(embed = NULL, version = 1) maybe_set_threads(store) chunks <- data.frame( @@ -302,6 +303,7 @@ test_that("embed functions get the defaults stored", { test_that("store v1 accepts markdown chunks (from v2)", { skip_on_cran() # See comment in test-retrieve.R + skip_if_cant_load_duckdb_extensions() store <- ragnar_store_create( version = 1, embed = \(x) matrix(nrow = length(x), ncol = 100, stats::runif(100)) diff --git 
a/tools/configure_deps.R b/tools/configure_deps.R
new file mode 100644
index 0000000..291c010
--- /dev/null
+++ b/tools/configure_deps.R
@@ -0,0 +1,51 @@
+# tools/configure_deps.R
+
+# Make reticulate setup the ephemeral venv in advance,
+# primarily so CRAN examples run quickly and don't trigger a warning
+
+Sys.setenv("RETICULATE_PYTHON" = "managed")
+
+library(reticulate)
+py_require(c(
+  "markitdown[all]",
+  if (identical(.Platform$OS.type, "windows")) {
+    py_require("onnxruntime<=1.20.1")
+  }
+))
+try({
+  print(py_config())
+  import("markitdown")
+})
+
+
+
+# Path to the Rscript executable of the R installation running this script.
+# This script runs before the package is installed, so package helpers such
+# as `is_windows()` are not available here; check the platform directly.
+rscript_exe <- function() {
+  file.path(
+    R.home("bin"),
+    if (identical(.Platform$OS.type, "windows")) "Rscript.exe" else "Rscript"
+  )
+}
+
+load_duckdb_extensions_in_subprocess <- function() {
+  # Download the duckdb extensions (which are also cached by duckdb).
+  # Same motivation as for reticulate: avoid a NOTE due to first-run download:
+  # 'Examples with CPU (user + system) or elapsed time > 5s'
+  # We do this in a subprocess in case of segfaults with mismatched ABI,
+  # see comments in package code.
+  try(system2(
+    rscript_exe(),
+    "-",
+    input = c(
+      "con <- DBI::dbConnect(duckdb::duckdb())",
+      "DBI::dbExecute(con, 'INSTALL fts; INSTALL vss;')",
+      "DBI::dbExecute(con, 'LOAD fts; LOAD vss;')"
+    )
+  ))
+}
+
+load_duckdb_extensions_in_subprocess()
+
+NULL
diff --git a/tools/configure_reticulate.R b/tools/configure_reticulate.R
deleted file mode 100644
index 4b473a7..0000000
--- a/tools/configure_reticulate.R
+++ /dev/null
@@ -1,18 +0,0 @@
-# tools/configure_reticulate.R
-
-# Make reticulate setup the ephemeral venv in advance,
-# primarily so CRAN examples run quickly and don't trigger a warning
-
-Sys.setenv("RETICULATE_PYTHON" = "managed")
-
-library(reticulate)
-py_require(c(
-  "markitdown[all]",
-  if (identical(.Platform$OS.type, "windows")) {
-    py_require("onnxruntime<=1.20.1")
-  }
-))
-try({
-  print(py_config())
-  import("markitdown")
-})