From 58481995f3237fede4d5fb6fe578c7776bea8f4b Mon Sep 17 00:00:00 2001 From: Bjorn Kallerud Date: Fri, 19 Sep 2025 11:32:09 -0700 Subject: [PATCH 1/2] Deprecate lag argument in initial_time_split and update NEWS --- NEWS.md | 2 ++ R/initial_split.R | 32 ++++++++++---------------------- 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5adedd6f..46c15dfb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # rsample (development version) +* The `lag` argument for `initial_time_split()` has been deprecated and now throws an error when supplied (#447). Supporting lags required overlapping rows between training and testing, which introduced data leakage (as discussed in #168). Users should instead pre-compute lagged variables before splitting, so that the test set remains strictly out-of-sample. + # rsample 1.3.1 * The new `internal_calibration_split()` function and its methods for various resamples is for usage in tune to create a internal split of the analysis set to fit the preprocessor and model on one part and the post-processor on the other part (#483, #488, #489, #569, #575, #577, #582). diff --git a/R/initial_split.R b/R/initial_split.R index d8626b4f..db493df8 100644 --- a/R/initial_split.R +++ b/R/initial_split.R @@ -26,13 +26,7 @@ #' drinks_split <- initial_time_split(drinks) #' train_data <- training(drinks_split) #' test_data <- testing(drinks_split) -#' c(max(train_data$date), min(test_data$date)) # no lag -#' -#' # With 12 period lag -#' drinks_lag_split <- initial_time_split(drinks, lag = 12) -#' train_data <- training(drinks_lag_split) -#' test_data <- testing(drinks_lag_split) -#' c(max(train_data$date), min(test_data$date)) # 12 period lag +#' c(max(train_data$date), min(test_data$date)) #' #' set.seed(1353) #' car_split <- group_initial_split(mtcars, cyl) @@ -76,27 +70,22 @@ initial_split <- function( } #' @rdname initial_split -#' @param lag A value to include a lag between the assessment -#' and analysis set. 
This is useful if lagged predictors will be used -#' during training and testing. +#' @param lag Deprecated. Pre-compute lagged variables before splitting. #' @export -initial_time_split <- function(data, prop = 3 / 4, lag = 0, ...) { +initial_time_split <- function(data, prop = 3 / 4, lag = lifecycle::deprecated(), ...) { check_dots_empty() check_prop(prop) - if (!is.numeric(lag) | !(lag %% 1 == 0)) { - cli_abort("{.arg lag} must be a whole number.") + if (lifecycle::is_present(lag)) { + lifecycle::deprecate_stop( + when = "1.4.0", + what = "initial_time_split(lag)" + ) } n_train <- floor(nrow(data) * prop) - if (lag > n_train) { - cli_abort( - "{.arg lag} must be less than or equal to the number of training observations." - ) - } - - split <- rsplit(data, 1:n_train, (n_train + 1 - lag):nrow(data)) + split <- rsplit(data, 1:n_train, (n_train + 1):nrow(data)) splits <- list(split) ids <- "Resample1" rset <- new_rset(splits, ids) @@ -104,8 +93,7 @@ initial_time_split <- function(data, prop = 3 / 4, lag = 0, ...) { res <- rset$splits[[1]] attrib <- list( - prop = prop, - lag = lag + prop = prop ) for (i in names(attrib)) { attr(res, i) <- attrib[[i]] From 5ad2826aa0ac9b720bf8cee4acf4ffe649703a94 Mon Sep 17 00:00:00 2001 From: Bjorn Kallerud <35743670+bjornkallerud@users.noreply.github.com> Date: Fri, 19 Sep 2025 11:49:24 -0700 Subject: [PATCH 2/2] Update R/initial_split.R Accepted GitHub format suggestion. Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- R/initial_split.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/R/initial_split.R b/R/initial_split.R index db493df8..264f6e98 100644 --- a/R/initial_split.R +++ b/R/initial_split.R @@ -72,7 +72,12 @@ initial_split <- function( #' @rdname initial_split #' @param lag Deprecated. Pre-compute lagged variables before splitting. #' @export -initial_time_split <- function(data, prop = 3 / 4, lag = lifecycle::deprecated(), ...) { +initial_time_split <- function( + data, + prop = 3 / 4, + lag = lifecycle::deprecated(), + ... 
+) { check_dots_empty() check_prop(prop)