Join GitHub today
GitHub is home to over 50 million developers working together to host and review code, manage projects, and build software together.
Sign upchanges to preProcess in 6.0-57 causes models fit in 6.0-52 to mis-predict with no warning #282
Comments
|
Can you give is a test case to look at? |
|
Not easily. But I'm pretty sure that using the old version to fit any data that's not centered with a lm, using caret's center/scale functionality, then using save() to save the object, then updating the package, then trying to predict, will give incorrect results. |
|
Here is my testing: ## old.R
library(caret)
set.seed(1)
class_tr <- twoClassSim(200)
class_te <- twoClassSim(200)
set.seed(1)
reg_tr <- SLC14_1(200)
reg_te <- SLC14_1(200)
###################################################################
## classification
set.seed(2)
lda_mod_old <- train(Class ~ ., data = class_tr,
method = "lda",
preProc = c("center", "scale"),
trControl = trainControl(method = "cv", classProbs = TRUE))
lda_pred_old <- predict(lda_mod_old, class_te, type = "prob")
###################################################################
## regression
set.seed(2)
lm_mod_old <- train(y ~ ., data = reg_tr,
method = "lm",
preProc = c("center", "scale"),
trControl = trainControl(method = "cv"))
lm_pred_old <- predict(lm_mod_old, reg_te)
set.seed(2)
lm_mod2_old <- train(y ~ ., data = reg_tr,
method = "lm",
trControl = trainControl(method = "cv"))
lm_pred2_old <- predict(lm_mod2_old, reg_te)
info_old <- sessionInfo()
save(lda_mod_old, lda_pred_old,
lm_mod_old, lm_pred_old,
lm_mod2_old, lm_pred2_old,
info_old,
file = "~/tmp/old.RData")## new.R
library(caret)
set.seed(1)
class_tr <- twoClassSim(200)
class_te <- twoClassSim(200)
set.seed(1)
reg_tr <- SLC14_1(200)
reg_te <- SLC14_1(200)
###################################################################
## classification
set.seed(2)
lda_mod_new <- train(Class ~ ., data = class_tr,
method = "lda",
preProc = c("center", "scale"),
trControl = trainControl(method = "cv", classProbs = TRUE))
lda_pred_new <- predict(lda_mod_new, class_te, type = "prob")
###################################################################
## regression
set.seed(2)
lm_mod_new <- train(y ~ ., data = reg_tr,
method = "lm",
preProc = c("center", "scale"),
trControl = trainControl(method = "cv"))
lm_pred_new <- predict(lm_mod_new, reg_te)
set.seed(2)
lm_mod2_new <- train(y ~ ., data = reg_tr,
method = "lm",
trControl = trainControl(method = "cv"))
lm_pred2_new <- predict(lm_mod2_new, reg_te)
info_new <- sessionInfo()
save(lda_mod_new, lda_pred_new,
lm_mod_new, lm_pred_new,
lm_mod2_new, lm_pred2_new,
info_new,
file = "~/tmp/new.RData")Testing these shows the same results: > load("~/tmp/old.RData")
> load("~/tmp/new.RData")
>
> info_new$otherPkgs$caret$Version
[1] "6.0-57"
> info_old$otherPkgs$caret$Version
[1] "6.0-52"
>
> lm_mod_new$preProcess$method
$center
[1] "Var01" "Var02" "Var03" "Var04" "Var05" "Var06" "Var07" "Var08" "Var09"
[10] "Var10" "Var11" "Var12" "Var13" "Var14" "Var15" "Var16" "Var17" "Var18"
[19] "Var19" "Var20"
$scale
[1] "Var01" "Var02" "Var03" "Var04" "Var05" "Var06" "Var07" "Var08" "Var09"
[10] "Var10" "Var11" "Var12" "Var13" "Var14" "Var15" "Var16" "Var17" "Var18"
[19] "Var19" "Var20"
$ignore
character(0)
> lm_mod_old$preProcess$method
[1] "center" "scale"
>
> all.equal(lm_pred_new, lm_pred_old)
[1] TRUE
> all.equal(lda_pred_new, lda_pred_old)
[1] TRUE
> all.equal(lm_mod_new$results, lm_mod_old$results)
[1] TRUE
> all.equal(lda_mod_new$results, lda_mod_old$results)
[1] TRUE |
|
Ug, it's early and I just realized that this isn't what you were saying.. I'll do some more testing. |
|
Here is a fix that you can use in the meantime. convert_method <- function(x) {
new_method <- list()
if("center" %in% x$method) new_method$center <- names(x$mean)
if("scale" %in% x$method) new_method$scale <- names(x$std)
if("YeoJohnson" %in% x$method) new_method$YeoJohnson <- names(x$yj)
if("expoTrans" %in% x$method) new_method$expoTrans <- names(x$et)
if("BoxCox" %in% x$method) new_method$BoxCox <- names(x$bc)
if("knnImpute" %in% x$method) new_method$knnImpute <- names(x$mean)
if("bagImpute" %in% x$method) new_method$bagImpute <- names(x$bagImp)
if("medianImpute" %in% x$method) new_method$medianImpute <- names(x$median)
if("pca" %in% x$method) new_method$pca <- names(x$mean)
if("ica" %in% x$method) new_method$ica <- names(x$mean)
if("spatialSign" %in% x$method) new_method$spatialSign <- names(x$mean)
x$method <- new_method
x
}> lm_mod_old$preProcess$method
[1] "center" "scale"
> lm_mod_new$preProcess$method
$center
[1] "Var01" "Var02" "Var03" "Var04" "Var05" "Var06" "Var07" "Var08" "Var09"
[10] "Var10" "Var11" "Var12" "Var13" "Var14" "Var15" "Var16" "Var17" "Var18"
[19] "Var19" "Var20"
$scale
[1] "Var01" "Var02" "Var03" "Var04" "Var05" "Var06" "Var07" "Var08" "Var09"
[10] "Var10" "Var11" "Var12" "Var13" "Var14" "Var15" "Var16" "Var17" "Var18"
[19] "Var19" "Var20"
$ignore
character(0)
>
> lm_mod_fixed <- lm_mod_old
> lm_mod_fixed$preProcess <- convert_method(lm_mod_fixed$preProcess)
> lm_pred_fixed <- predict(lm_mod_fixed, reg_te)
> all.equal(lm_pred_fixed, lm_pred_new)
[1] TRUE
>
> lda_mod_fixed <- lda_mod_old
> lda_mod_fixed$preProcess <- convert_method(lda_mod_fixed$preProcess)
> lda_pred_fixed <- predict(lda_mod_fixed, class_te, type = "prob")
> all.equal(lda_pred_fixed, lda_pred_new)
[1] TRUEI'm putting together a new version for CRAN. Hopefully, they will not have a beef with a new version so soon after the last. Thanks, Max |
|
Thank you, Max! We really appreciate the help! (And all of your work on On Tue, Oct 20, 2015 at 11:46 AM, topepo notifications@github.com wrote:
|
|
No problem; sorry for the bug. There is a new version (6.0-58) on CRAN this morning. |
It looks to me like you introduced a breaking change in 6.0-57. Model objects that were created in 6.0-52 and saved/persisted then loaded in 6.0-52 mis-predict because of the changes to preProcess. Specifically, it looks like you change the way that center/scale information is stored, which means that the new version of the code does not detect center/scale, and so unprocessed inputs are applied, which causes wildly incorrect predictions with no error/warning to the user.
I don't quite know what a good fix is -- if there's a way to detect the old version of the object and fail early, that would have been vastly preferred.
https://github.com/topepo/caret/blame/861f0e41bf9b85b60b14cd377b395e963cf3de7b/pkg/caret/R/preProcess.R#L308