From 9e2a31e368bd0d94ec09677c771949af397e9536 Mon Sep 17 00:00:00 2001 From: Daniel Falbel Date: Thu, 31 Jul 2025 16:07:04 -0300 Subject: [PATCH 1/6] Check we can load duckdb extensions in a separate process --- R/aaa-utils.R | 38 +++++++++++++++++++++++++++++ R/retrieve.R | 2 +- R/store.R | 3 ++- man/ragnar_retrieve.Rd | 2 +- man/ragnar_retrieve_vss.Rd | 2 +- man/ragnar_retrieve_vss_and_bm25.Rd | 2 +- man/ragnar_store_create.Rd | 2 ++ tests/testthat/helper-doc.R | 8 ++++++ tests/testthat/test-extra-cols.R | 1 + tests/testthat/test-retrieve.R | 2 ++ tests/testthat/test-store.R | 2 ++ 11 files changed, 59 insertions(+), 5 deletions(-) diff --git a/R/aaa-utils.R b/R/aaa-utils.R index 5a26a5f..75e7b7f 100644 --- a/R/aaa-utils.R +++ b/R/aaa-utils.R @@ -281,3 +281,41 @@ prop_string <- function( ) ) } + +can_load_duckdb_extensions <- local({ + # DuckDB extensions are shared libraries downloaded when + # running `INSTAll `. They are pre-built by the DuckDB + # team https://github.com/duckdb/extension-ci-tools + # They are built for the major platforms using the standard + # compilers. + # One of the CRAN test server is a Linux machine with R compiled + # with clang instead of GCC. Turns since the compilers have different + # ABIs, a crash happens when trying to execute extensions that are + # pre-built on GCC. + # To avoid the crash on CRAN machines, we check if the extensions can + # be loaded in a separate process and proceed if that's possible. 
+ can <- NULL + function() { + if (is.null(can)) { + can <<- 0 == system2( + rscript_exe(), + "-", + input = c( + "con <- DBI::dbConnect(duckdb::duckdb())", + "DBI::dbExecute(con, 'install fts; load fts;')" + ), + stderr = FALSE, + stdout = FALSE + ) + } + can + } +}) + +rscript_exe <- function() { + file.path( + R.home("bin"), + if (is_windows()) "Rscript.exe" else "Rscript" + ) +} + diff --git a/R/retrieve.R b/R/retrieve.R index f205204..850ea7d 100644 --- a/R/retrieve.R +++ b/R/retrieve.R @@ -484,7 +484,7 @@ ragnar_retrieve_vss_and_bm25 <- function(store, text, top_k = 3, ...) { #' represents a chunk and always contains a `text` column. #' #' @family ragnar_retrieve -#' @examplesIf (rlang::is_installed("dbplyr") && nzchar(Sys.getenv("OPENAI_API_KEY"))) +#' @examplesIf (rlang::is_installed("dbplyr") && nzchar(Sys.getenv("OPENAI_API_KEY")) && ragnar:::can_load_duckdb_extensions()) #' ## Build a small store with categories #' store <- ragnar_store_create( #' embed = \(x) ragnar::embed_openai(x, model = "text-embedding-3-small"), diff --git a/R/store.R b/R/store.R index 6634332..17c4a51 100644 --- a/R/store.R +++ b/R/store.R @@ -55,7 +55,7 @@ #' #' @returns a `RagnarStore` object #' @export -#' @examples +#' @examplesIf ragnar:::can_load_duckdb_extensions() #' # A store with a dummy embedding #' store <- ragnar_store_create( #' embed = \(x) matrix(stats::runif(10), nrow = length(x), ncol = 10), @@ -239,6 +239,7 @@ ragnar_store_connect <- function( stop("Store must be created with ragnar_store_create()") } + stop("a") dbExecute(con, "INSTALL fts; INSTALL vss;") dbExecute(con, "LOAD fts; LOAD vss;") diff --git a/man/ragnar_retrieve.Rd b/man/ragnar_retrieve.Rd index 14edc80..807c6b3 100644 --- a/man/ragnar_retrieve.Rd +++ b/man/ragnar_retrieve.Rd @@ -30,7 +30,7 @@ union of chunks retrieved by both methods. The results are not re-ranked after identifying the unique values. 
} \examples{ -\dontshow{if ((rlang::is_installed("dbplyr") && nzchar(Sys.getenv("OPENAI_API_KEY")))) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\dontshow{if ((rlang::is_installed("dbplyr") && nzchar(Sys.getenv("OPENAI_API_KEY")) && ragnar:::can_load_duckdb_extensions())) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} ## Build a small store with categories store <- ragnar_store_create( embed = \(x) ragnar::embed_openai(x, model = "text-embedding-3-small"), diff --git a/man/ragnar_retrieve_vss.Rd b/man/ragnar_retrieve_vss.Rd index 022b11f..264dbaa 100644 --- a/man/ragnar_retrieve_vss.Rd +++ b/man/ragnar_retrieve_vss.Rd @@ -60,7 +60,7 @@ sets or filtered queries. The results are not re-ranked after identifying the unique values. } \examples{ -\dontshow{if ((rlang::is_installed("dbplyr") && nzchar(Sys.getenv("OPENAI_API_KEY")))) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\dontshow{if ((rlang::is_installed("dbplyr") && nzchar(Sys.getenv("OPENAI_API_KEY")) && ragnar:::can_load_duckdb_extensions())) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} ## Build a small store with categories store <- ragnar_store_create( embed = \(x) ragnar::embed_openai(x, model = "text-embedding-3-small"), diff --git a/man/ragnar_retrieve_vss_and_bm25.Rd b/man/ragnar_retrieve_vss_and_bm25.Rd index 33483f2..14d51c6 100644 --- a/man/ragnar_retrieve_vss_and_bm25.Rd +++ b/man/ragnar_retrieve_vss_and_bm25.Rd @@ -27,7 +27,7 @@ documents. The results are not re-ranked after identifying the unique values. 
} \examples{ -\dontshow{if ((rlang::is_installed("dbplyr") && nzchar(Sys.getenv("OPENAI_API_KEY")))) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\dontshow{if ((rlang::is_installed("dbplyr") && nzchar(Sys.getenv("OPENAI_API_KEY")) && ragnar:::can_load_duckdb_extensions())) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} ## Build a small store with categories store <- ragnar_store_create( embed = \(x) ragnar::embed_openai(x, model = "text-embedding-3-small"), diff --git a/man/ragnar_store_create.Rd b/man/ragnar_store_create.Rd index de2db7b..4ed984b 100644 --- a/man/ragnar_store_create.Rd +++ b/man/ragnar_store_create.Rd @@ -90,6 +90,7 @@ chunk. The easiest way to prepare \code{chunks} for \code{version = 1} is with } } \examples{ +\dontshow{if (ragnar:::can_load_duckdb_extensions()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} # A store with a dummy embedding store <- ragnar_store_create( embed = \(x) matrix(stats::runif(10), nrow = length(x), ncol = 10), @@ -133,4 +134,5 @@ ragnar_store_build_index(store) ragnar_retrieve(store, "abc bcd xyz", deoverlap = FALSE) ragnar_retrieve(store, "abc bcd xyz", deoverlap = TRUE) +\dontshow{\}) # examplesIf} } diff --git a/tests/testthat/helper-doc.R b/tests/testthat/helper-doc.R index 487205d..523ee47 100644 --- a/tests/testthat/helper-doc.R +++ b/tests/testthat/helper-doc.R @@ -33,7 +33,15 @@ skip_on_cran <- function() { skip_if(maybe_on_cran(), "Maybe On CRAN") } +skip_if_cant_load_duckdb_extensions <- function() { + if (!can_load_duckdb_extensions()) { + testthat::skip("DuckDB extensions cannot be loaded") + } + testthat::skip("a") +} + skip_if_cant_use_motherduck <- function() { + skip_if_cant_load_duckdb_extensions() if (Sys.getenv("motherduck_token") == "") { testthat::skip("motherduck_token not set") } diff --git a/tests/testthat/test-extra-cols.R b/tests/testthat/test-extra-cols.R index 331c620..747621d 100644 --- 
a/tests/testthat/test-extra-cols.R +++ b/tests/testthat/test-extra-cols.R @@ -1,5 +1,6 @@ test_that("extra cols works", { skip_on_cran() # See comment in test-retrieve.R and test-read-markdown.R + skip_if_cant_load_duckdb_extensions() store <- ragnar_store_create( version = 2, embed = \(x) matrix(nrow = length(x), ncol = 100, stats::runif(100)), diff --git a/tests/testthat/test-retrieve.R b/tests/testthat/test-retrieve.R index d9184fa..4783f9a 100644 --- a/tests/testthat/test-retrieve.R +++ b/tests/testthat/test-retrieve.R @@ -5,6 +5,7 @@ system.time(test_that("retrieving works as expected, v1", { # > Running R code in 'testthat.R' had CPU time 2.6 times elapsed time # Unfortunately, this means we can't test properly on CRAN. skip_on_cran() + skip_if_cant_load_duckdb_extensions() # Create a simple store and insert some chunks store <- ragnar_store_create( @@ -57,6 +58,7 @@ system.time(test_that("retrieving works as expected, v1", { test_that("retrieving works as expected", { skip_on_cran() # See comment (above) in test-retrieve.R + skip_if_cant_load_duckdb_extensions() # Create a simple store and insert some chunks store <- ragnar_store_create( embed = \(x) matrix(nrow = length(x), ncol = 100, stats::runif(100)) diff --git a/tests/testthat/test-store.R b/tests/testthat/test-store.R index b7c2832..57c6a9b 100644 --- a/tests/testthat/test-store.R +++ b/tests/testthat/test-store.R @@ -222,6 +222,7 @@ test_that("additional columns", { test_that("Allow a NULL embedding function", { skip_on_cran() # See comment in test-retrieve.R + skip_if_cant_load_duckdb_extensions() store <- ragnar_store_create(embed = NULL, version = 1) maybe_set_threads(store) chunks <- data.frame( @@ -302,6 +303,7 @@ test_that("embed functions get the defaults stored", { test_that("store v1 accepts markdown chunks (from v2)", { skip_on_cran() # See comment in test-retrieve.R + skip_if_cant_load_duckdb_extensions() store <- ragnar_store_create( version = 1, embed = \(x) matrix(nrow = length(x), ncol 
= 100, stats::runif(100)) From a9c6ffb140e34f5d68f0429dca15febadff51841 Mon Sep 17 00:00:00 2001 From: Daniel Falbel Date: Thu, 31 Jul 2025 16:15:09 -0300 Subject: [PATCH 2/6] Don't skip! --- tests/testthat/helper-doc.R | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/testthat/helper-doc.R b/tests/testthat/helper-doc.R index 523ee47..4c1a0c0 100644 --- a/tests/testthat/helper-doc.R +++ b/tests/testthat/helper-doc.R @@ -37,7 +37,6 @@ skip_if_cant_load_duckdb_extensions <- function() { if (!can_load_duckdb_extensions()) { testthat::skip("DuckDB extensions cannot be loaded") } - testthat::skip("a") } skip_if_cant_use_motherduck <- function() { From bdecbecd97621f9b791fb87d3b32919c228b7486 Mon Sep 17 00:00:00 2001 From: Daniel Falbel Date: Tue, 5 Aug 2025 15:31:41 -0300 Subject: [PATCH 3/6] Add NEWS bullet --- NEWS.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/NEWS.md b/NEWS.md index e3ab052..ad1105d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,6 +9,10 @@ * `ragnar_inspector()` now renders all urls as clickable links in the chunk markdown viewer, even if url is not a formal markdown link (#82). +* Before running examples and tests we now check if ragnar can load DuckDB extensions. + This fixes issues in environments where DuckDB pre-built binaries for extensions are not + compatible with the installed DuckDB version (#94). + # ragnar 0.2.0 * `ragnar_store_create()` gains a new argument: `version`, with default `2`. From 602e7b280abe1eb404c2c929ef4e62623e5c1dc6 Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Thu, 14 Aug 2025 09:30:42 -0400 Subject: [PATCH 4/6] load vss also in checker --- R/aaa-utils.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R/aaa-utils.R b/R/aaa-utils.R index 75e7b7f..b8b7711 100644 --- a/R/aaa-utils.R +++ b/R/aaa-utils.R @@ -287,10 +287,10 @@ can_load_duckdb_extensions <- local({ # running `INSTAll `. 
They are pre-built by the DuckDB # team https://github.com/duckdb/extension-ci-tools # They are built for the major platforms using the standard - # compilers. + # compilers. # One of the CRAN test server is a Linux machine with R compiled # with clang instead of GCC. Turns since the compilers have different - # ABIs, a crash happens when trying to execute extensions that are + # ABIs, a crash happens when trying to execute extensions that are # pre-built on GCC. # To avoid the crash on CRAN machines, we check if the extensions can # be loaded in a separate process and proceed if that's possible. @@ -302,7 +302,8 @@ can_load_duckdb_extensions <- local({ "-", input = c( "con <- DBI::dbConnect(duckdb::duckdb())", - "DBI::dbExecute(con, 'install fts; load fts;')" + "DBI::dbExecute(con, 'INSTALL fts; LOAD fts;')", + "DBI::dbExecute(con, 'INSTALL vss; LOAD vss;')" ), stderr = FALSE, stdout = FALSE From dc7eb07bad3e6346773d356871566180c29e9b0d Mon Sep 17 00:00:00 2001 From: Tomasz Kalinowski Date: Thu, 14 Aug 2025 09:31:44 -0400 Subject: [PATCH 5/6] predownload duckdb extensions in configure --- configure | 2 +- configure.win | 2 +- tools/configure_deps.R | 48 ++++++++++++++++++++++++++++++++++++ tools/configure_reticulate.R | 18 -------------- 4 files changed, 50 insertions(+), 20 deletions(-) create mode 100644 tools/configure_deps.R delete mode 100644 tools/configure_reticulate.R diff --git a/configure b/configure index dd74c59..4e4cd7c 100755 --- a/configure +++ b/configure @@ -1,2 +1,2 @@ #!/bin/sh -"${R_HOME}/bin/Rscript" tools/configure_reticulate.R +"${R_HOME}/bin/Rscript" tools/configure_deps.R diff --git a/configure.win b/configure.win index dd74c59..4e4cd7c 100755 --- a/configure.win +++ b/configure.win @@ -1,2 +1,2 @@ #!/bin/sh -"${R_HOME}/bin/Rscript" tools/configure_reticulate.R +"${R_HOME}/bin/Rscript" tools/configure_deps.R diff --git a/tools/configure_deps.R b/tools/configure_deps.R new file mode 100644 index 0000000..291c010 --- /dev/null +++ 
b/tools/configure_deps.R @@ -0,0 +1,48 @@ +# tools/configure_deps.R + +# Make reticulate setup the ephemeral venv in advance, +# primarily so CRAN examples run quickly and don't trigger a warning + +Sys.setenv("RETICULATE_PYTHON" = "managed") + +library(reticulate) +py_require(c( + "markitdown[all]", + if (identical(.Platform$OS.type, "windows")) { + py_require("onnxruntime<=1.20.1") + } +)) +try({ + print(py_config()) + import("markitdown") +}) + + + +rscript_exe <- function() { + file.path( + R.home("bin"), + if (identical(.Platform$OS.type, "windows")) "Rscript.exe" else "Rscript" + ) +} + +load_duckdb_extensions_in_subprocess <- function() { + # download duckdb extensions (which are also cached by duckdb) + # same motivation as reticulate, avoid NOTE due to first-run download: + # 'Examples with CPU (user + system) or elapsed time > 5s + # We do this in a subprocess in case of segfaults with mismatched ABI, + # see comments in package code. + try(system2( + rscript_exe(), + "-", + input = c( + "con <- DBI::dbConnect(duckdb::duckdb())", + "DBI::dbExecute(con, 'INSTALL fts; INSTALL vss;')", + "DBI::dbExecute(con, 'LOAD fts; LOAD vss;')" + ) + )) +} + +load_duckdb_extensions_in_subprocess() + +NULL diff --git a/tools/configure_reticulate.R b/tools/configure_reticulate.R deleted file mode 100644 index 4b473a7..0000000 --- a/tools/configure_reticulate.R +++ /dev/null @@ -1,18 +0,0 @@ -# tools/configure_reticulate.R - -# Make reticulate setup the ephemeral venv in advance, -# primarily so CRAN examples run quickly and don't trigger a warning - -Sys.setenv("RETICULATE_PYTHON" = "managed") - -library(reticulate) -py_require(c( - "markitdown[all]", - if (identical(.Platform$OS.type, "windows")) { - py_require("onnxruntime<=1.20.1") - } -)) -try({ - print(py_config()) - import("markitdown") -}) From 748cca7a8d84e23235cbaee24c8380e4cf8fa815 Mon Sep 17 00:00:00 2001 From: Daniel Falbel Date: Thu, 14 Aug 2025 10:50:24 -0300 Subject: [PATCH 6/6] Update R/store.R --- R/store.R | 1 - 1 file
changed, 1 deletion(-) diff --git a/R/store.R b/R/store.R index 17c4a51..3cc2590 100644 --- a/R/store.R +++ b/R/store.R @@ -239,7 +239,6 @@ ragnar_store_connect <- function( stop("Store must be created with ragnar_store_create()") } - stop("a") dbExecute(con, "INSTALL fts; INSTALL vss;") dbExecute(con, "LOAD fts; LOAD vss;")