Add the nocenter option to coxph. Pass R CMD check

therneau · Feb 15, 2021 · cd13496 · cd13496
1 parent 9c2a4be
commit cd13496
Show file tree

Hide file tree

Showing 36 changed files with 321 additions and 232 deletions.
diff --git a/R/agexact.fit.R b/R/agexact.fit.R
@@ -1,6 +1,6 @@
 agexact.fit <- function(x, y, strata, offset, init, control,
 			  weights, method, rownames,
-                         resid=TRUE)
+                         resid=TRUE, nocenter=NULL)
     {
     if (!is.matrix(x)) stop("Invalid formula for cox fitting function")
     if (!is.null(weights) && any(weights!=1))
@@ -44,6 +44,10 @@ agexact.fit <- function(x, y, strata, offset, init, control,
     }
     else init <- rep(0,nvar)
 
+    # 2012 change: individually choose which variable to rescale
+    # default: leave 0/1 variables along
+    if (is.null(nocenter)) zero.one <- rep(FALSE, ncol(x))
+    else zero.one <- apply(x, 2, function(z) all(z %in% nocenter))
     agfit <- .C(Cagexact, iter= as.integer(control$iter.max),
 		   as.integer(n),
 		   as.integer(nvar), sstart, sstop,
@@ -60,7 +64,8 @@ agexact.fit <- function(x, y, strata, offset, init, control,
 		   integer(2*n),
 		   as.double(control$eps),
 		   as.double(control$toler.chol),
-		   sctest=double(1))
+		   sctest=double(1), 
+                   ifelse(zero.one, 0L, 1L))
 
     var <- matrix(agfit$imat,nvar,nvar)
     coef <- agfit$coef

diff --git a/R/agreg.fit.R b/R/agreg.fit.R
@@ -1,6 +1,6 @@
 # Automatically generated from the noweb directory
 agreg.fit <- function(x, y, strata, offset, init, control,
-                        weights, method, rownames, resid=TRUE)
+                        weights, method, rownames, resid=TRUE, nocenter=NULL)
     {
     nvar <- ncol(x)
     event <- y[,3]
@@ -61,6 +61,11 @@ agreg.fit <- function(x, y, strata, offset, init, control,
         if (length(init) != nvar) stop("Wrong length for inital values")
         }
 
+    # 2021 change: pass in per covariate centering.  This gives
+    #  us more freedom to experiment.  Default is to leave 0/1 variables alone
+    if (is.null(nocenter)) zero.one <- rep(FALSE, ncol(x))
+    zero.one <- apply(x, 2, function(z) all(z %in% nocenter)) 
+
     # the returned value of agfit$coef starts as a copy of init, so make sure
     #  is is a vector and not a matrix; as.double suffices.
     # Solidify the storage mode of other arguments
@@ -75,7 +80,10 @@ agreg.fit <- function(x, y, strata, offset, init, control,
                    as.integer(maxiter), 
                    as.double(control$eps),
                    as.double(control$toler.chol),
-                   as.integer(1)) # internally rescale
+                   ifelse(zero.one, 0L, 1L))
+    # agfit4 centers variables within strata, so does not return a vector
+    #  of means.  Use a fill in consistent with other coxph routines
+    agmeans <- ifelse(zero.one, 0, colMeans(x))
 
     vmat <- agfit$imat
     coef <- agfit$coef
@@ -98,7 +106,7 @@ agreg.fit <- function(x, y, strata, offset, init, control,
                               "; beta may be infinite. "))
         }
     }
-    lp  <- as.vector(x %*% coef + offset - sum(coef * colMeans(x)))
+    lp  <- as.vector(x %*% coef + offset - sum(coef * agmeans))
     if (resid) {
         score <- as.double(exp(lp))
         residuals <- .Call(Cagmart3, nused,
@@ -134,7 +142,7 @@ agreg.fit <- function(x, y, strata, offset, init, control,
                          iter   = agfit$iter,
                          linear.predictors = as.vector(lp),
                          residuals = residuals, 
-                         means = colMeans(x),
+                         means = agmeans,
                          first = agfit$u,
                          info = flag,
                          method= method,
@@ -146,7 +154,7 @@ agreg.fit <- function(x, y, strata, offset, init, control,
                          score  = agfit$sctest,
                          iter   = agfit$iter,
                          linear.predictors = as.vector(lp),
-                         means = colMeans(x),
+                         means = agmeans,
                          first = agfit$u,
                          info = flag,
                          method = method,

diff --git a/R/coxexact.fit.R b/R/coxexact.fit.R
@@ -3,7 +3,7 @@
 #   case-control data.
 coxexact.fit <- function(x, y, strata, offset, init, control,
 			  weights, method, rownames,
-                         resid=TRUE)
+                         resid=TRUE, nocenter=NULL)
     {
     if (!is.matrix(x)) stop("Invalid formula for cox fitting function")
     if (!is.null(weights) && any(weights!=1))
@@ -50,10 +50,16 @@ coxexact.fit <- function(x, y, strata, offset, init, control,
     # Prescale the data set to improve numerical accuracy.
     #  We will undo the scaling before finishing up.
     newx <- scale(x[sorted,])
-#    newx <- scale(x, scale=NULL)  #debug
     rescale <- attr(newx, "scaled:scale")
     means   <- attr(newx, "scaled:center")
-
+    if (!is.null(nocenter)){
+       zero.one <- apply(x, 2, function(z) all(z %in% nocenter))
+       for (i in which(zero.one)) {
+           newx[,i] <- x[sorted,i]
+           rescale[i] <- 1.0
+           means[i]   <- 0.0
+       }
+    }
     cfit <- .Call(Ccoxexact, 
                   as.integer(maxiter),
                   as.double(y),  # interger data?  Just in case.

diff --git a/R/coxpenal.fit.R b/R/coxpenal.fit.R
@@ -3,7 +3,7 @@
 #
 coxpenal.fit <- function(x, y, strata, offset, init, control,
 			weights, method, rownames, 
-			pcols, pattr, assign) {
+			pcols, pattr, assign, nocenter=NULL) {
     eps <- control$eps
     n <-  nrow(y)
     if (is.matrix(x)) nvar <- ncol(x)
@@ -283,7 +283,12 @@ coxpenal.fit <- function(x, y, strata, offset, init, control,
 
     ## need the current environment for callbacks
     rho<-environment()
-
+
+    # 2012 change: individually choose which variable to recenter
+    # default: leave 0/1 variables along
+    if (is.null(nocenter)) zero.one <- rep(FALSE, ncol(x))
+    zero.one <- apply(x, 2, function(z) all(z %in% nocenter))
+
     #
     # Have C store the data, and get the loglik for beta=initial, frailty=0
     #
@@ -308,7 +313,8 @@ coxpenal.fit <- function(x, y, strata, offset, init, control,
                                as.integer(nfrail),
                                as.integer(frailx),
                                #R callback additions
-                               f.expr1,f.expr2,rho)
+                               f.expr1,f.expr2,rho,
+                               ifelse(zero.one, 0L, 1L))
     else       coxfit <- .C(Ccoxfit5a, 
                                as.integer(n),
                                as.integer(nvar), 
@@ -327,7 +333,8 @@ coxpenal.fit <- function(x, y, strata, offset, init, control,
                                as.integer(full.imat),
                                as.integer(nfrail),
                                as.integer(frailx),
-                               f.expr1,f.expr2,rho)
+                               f.expr1,f.expr2,rho,
+                               ifelse(zero.one, 0L, 1L))
 
     loglik0 <- coxfit$loglik
     means   <- coxfit$means

diff --git a/R/coxph.R b/R/coxph.R
@@ -4,7 +4,7 @@ coxph <- function(formula, data, weights, subset, na.action,
         init, control, ties= c("efron", "breslow", "exact"),
         singular.ok =TRUE,  robust,
         model=FALSE, x=FALSE, y=TRUE,  tt, method=ties, 
-        id, cluster, istate, statedata,...) {
+        id, cluster, istate, statedata, nocenter=c(-1, 0, 1), ...) {
 
     ties <- match.arg(ties)
     Call <- match.call()
@@ -545,23 +545,31 @@ coxph <- function(formula, data, weights, subset, na.action,
         fit <- coxpenal.fit(X, Y, istrat, offset, init=init,
                             control,
                             weights=weights, method=method,
-                            row.names(mf), pcols, pattr, assign)
+                            row.names(mf), pcols, pattr, assign, 
+                            nocenter= nocenter)
     }
     else {
+        rname <- row.names(mf)
+        if (multi) rname <- rname[xstack$rindex]
         if( method=="breslow" || method =="efron") {
-            if (grepl('right', type))  fitter <- get("coxph.fit")
-            else                 fitter <- get("agreg.fit")
+            if (grepl('right', type))  
+                fit <- coxph.fit(X, Y, istrat, offset, init, control, 
+                                 weights=weights, method=method, 
+                                 rname, nocenter=nocenter)
+            else  fit <- agreg.fit(X, Y, istrat, offset, init, control, 
+                                   weights=weights, method=method, 
+                                   rname, nocenter=nocenter)
         }
         else if (method=='exact') {
-            if (type== "right")  fitter <- get("coxexact.fit")
-            else  fitter <- get("agexact.fit")
+            if (type== "right")  
+                fit <- coxexact.fit(X, Y, istrat, offset, init, control, 
+                                    weights=weights, method=method, 
+                                    rname, nocenter=nocenter)
+            else fit <- agexact.fit(X, Y, istrat, offset, init, control, 
+                                    weights=weights, method=method, 
+                                    rname, nocenter=nocenter)
         }
         else stop(paste ("Unknown method", method))
-
-        rname <- row.names(mf)
-        if (multi) rname <- rname[xstack$rindex]
-        fit <- fitter(X, Y, istrat, offset, init, control, weights=weights,
-                      method=method, rname)
     }
     if (is.character(fit)) {
         fit <- list(fail=fit)

diff --git a/R/coxph.fit.R b/R/coxph.fit.R
@@ -1,5 +1,6 @@
 coxph.fit <- function(x, y, strata, offset, init, control,
-			weights, method, rownames, resid=TRUE)
+                      weights, method, rownames, resid=TRUE, 
+                      nocenter=NULL)
 {
     n <-  nrow(y)
     if (is.matrix(x)) nvar <- ncol(x)
@@ -49,6 +50,11 @@ coxph.fit <- function(x, y, strata, offset, init, control,
 	else init <- rep(0,nvar)
 	}
 
+    # 2012 change: individually choose which variable to rescale
+    # default: leave 0/1 variables along
+    if (is.null(nocenter)) zero.one <- rep(FALSE, ncol(x))
+    else zero.one <- apply(x, 2, function(z) all(z %in% nocenter))
+
     storage.mode(weights) <- storage.mode(init) <- "double"
     coxfit <- .Call(Ccoxfit6, 
                      as.integer(maxiter),
@@ -62,7 +68,7 @@ coxph.fit <- function(x, y, strata, offset, init, control,
                      as.double(control$eps),
                      as.double(control$toler.chol),
                      as.vector(init),
-                     as.integer(1))  # internally rescale
+                     ifelse(zero.one, 0L, 1L))
 
     if (nullmodel) {
         if (resid) {

diff --git a/R/survfit.coxph.R b/R/survfit.coxph.R
@@ -35,6 +35,7 @@ survfit.coxph <-
           ctype <- c(1,1,2)[temp1]
       }
       else if (!(ctype %in% 1:2)) stop ("ctype must be 1 or 2")
+      if (!(stype %in% 1:2)) stop("stype must be 1 or 2")
 
       if (!se.fit) conf.type <- "none"
       else conf.type <- match.arg(conf.type)

diff --git a/R/survfit.coxphms.R b/R/survfit.coxphms.R
@@ -26,7 +26,8 @@ function(formula, newdata, se.fit=TRUE, conf.int=.95, individual=FALSE,
             index <- which(ctemp >0)
             baselinecoef[2, index] <- exp(object$coef[ctemp[index]])
         }
-    }       
+    } else phbase <- rep(FALSE, nrow(object$cmap))
+
     # process options, set up Y and the model frame, deal with start.time
     Terms  <- terms(object)
     robust <- !is.null(object$naive.var)   # did the coxph model use robust var?
@@ -371,7 +372,8 @@ function(formula, newdata, se.fit=TRUE, conf.int=.95, individual=FALSE,
     else {
         if (is.factor(strata)) ustrata <- levels(strata)
         else                   ustrata <- sort(unique(strata))
-        nstrata <- length(ustrata)
+        nstrata <- length(cifit$strata)
+        itemp <- rep(1:nstrata, cifit$strata)
         timelist <- split(cifit$time, itemp)
         ustrata <- names(cifit$strata)
         tfit <- vector("list", nstrata)
@@ -386,8 +388,8 @@ function(formula, newdata, se.fit=TRUE, conf.int=.95, individual=FALSE,
 
         # do.call(rbind) doesn't work for arrays, it loses a dimension
         ntime <- length(cifit$time)
-        cifit$pstate <- array(0., dim=c(ntime, dim(survlist[[1]]$pstate)[2:3]))
-        cifit$cumhaz <- array(0., dim=c(ntime, dim(survlist[[1]]$cumhaz)[2:3]))
+        cifit$pstate <- array(0., dim=c(ntime, dim(tfit[[1]]$pstate)[2:3]))
+        cifit$cumhaz <- array(0., dim=c(ntime, dim(tfit[[1]]$cumhaz)[2:3]))
         rtemp <- split(seq(along=cifit$time), itemp)
         for (i in 1:nstrata) {
             cifit$pstate[rtemp[[i]],,] <- tfit[[i]]$pstate
@@ -431,10 +433,10 @@ multihaz <- function(y, x, position, weight, risk, istrat, ctype, stype,
     }
 
     temp <- matrix(cn[,5] / denom1, ncol = fit$ntrans)
-    hazard <- temp[,bcoef[,1]] * rep(bcoef[,2], each=nrow(temp))
+    hazard <- temp[,bcoef[1,]] * rep(bcoef[2,], each=nrow(temp))
     if (se.fit) {
         temp <- matrix(cn[,5] / denom2, ncol = fit$ntrans)
-        varhaz <- temp[,bcoef[,1]] * rep(bcoef[,2]^2, each=nrow(temp))
+        varhaz <- temp[,bcoef[1,]] * rep(bcoef[2,]^2, each=nrow(temp))
     }
 
     # Expand the result, one "hazard set" for each row of x2

diff --git a/inst/NEWS.Rd b/inst/NEWS.Rd
@@ -10,9 +10,18 @@
     first rather than the last.
     \item Add the last test case for residuals.survfit (auc for tstart,
     tstop data), and yes, it uncovered an error.
-    \item Survit.coxph for a multistate fit with strata error, an array
+    \item Error in survit.coxph for a multistate fit with strata, an array
     dimension was wrong (but contents correct).  It led to downstream
     problems with print and plot.  Pointed out by Matthew Hall.
+    \item Add ability for a multistate coxph model to have shared
+    (proportional) baselines for transitions.
+    \item The coxph routine no longer recenters 0/1 predictors (fit$mean
+    will be 0), per the default value of the new \code{nocenter}
+    argument. Application of survfit to a coxph model without specifying
+    newdata remains a bad idea, but this change makes it a little
+    less bad since for factors the curve will refer to the reference
+    level.  The results of predict(fit) are now more defensible for a
+    factor variable.
 }}
 \section{Changes in version 3.2-8}{
   \itemize{

diff --git a/man/agreg.fit.Rd b/man/agreg.fit.Rd
@@ -11,9 +11,9 @@
 }
 \usage{
 agreg.fit(x, y, strata, offset, init, control, weights, method,
-rownames, resid=TRUE)
+rownames, resid=TRUE, nocenter=NULL)
 coxph.fit(x, y, strata, offset, init, control, weights, method,
-rownames, resid=TRUE)
+rownames, resid=TRUE, nocenter=NULL)
 }
 \arguments{
   \item{x}{Matix of predictors.  This should \emph{not} include an
@@ -31,6 +31,9 @@ rownames, resid=TRUE)
   \item{rownames}{this is only needed for a NULL model, in which case it
     contains the rownames (if any) of the original data.}
   \item{resid}{compute and return residuals.}
+  \item{nocenter}{an optional list of values. Any column of the X matrix
+    whose values lie strictly within that set will not be recentered.
+  Note that the coxph function has (-1, 0, 1) as the default.}
 }
 \details{
 This routine does no checking that arguments are the proper length or

diff --git a/man/coxph.Rd b/man/coxph.Rd
@@ -20,7 +20,7 @@ coxph(formula, data=, weights, subset,
       ties=c("efron","breslow","exact"), 
       singular.ok=TRUE, robust, 
       model=FALSE, x=FALSE, y=TRUE, tt, method=ties,
-      id, cluster, istate, statedata, ...)
+      id, cluster, istate, statedata, nocenter=c(-1, 0, 1), ...)
 }
 \arguments{
   \item{formula}{
@@ -107,6 +107,8 @@ coxph(formula, data=, weights, subset,
   }
   \item{tt}{optional list of time-transform functions.}
   \item{method}{alternate name for the \code{ties} argument.}
+  \item{nocenter}{columns of the X matrix whose values lie strictly
+      within this set are not recentered}
   \item{...}{Other arguments will be passed to \code{\link{coxph.control}}
   }
 }
@@ -143,6 +145,9 @@ of which applies to an interval of observation (start, stop].
 The routine internally scales and centers data to avoid overflow in
 the argument to the exponential function.  These actions do not change
 the result, but lead to more numerical stability.
+Any column of the X matrix whose values lie within \code{nocenter} list
+are not recentered.  The practical consequence of the default is to not
+recenter dummy variables corresponding to factors.
 However, arguments to offset are not scaled since there are situations
 where a large offset value is a purposefully used.
 In general, however, users should not avoid very large numeric values
@@ -196,7 +201,7 @@ cluster function would not be noticed.
 (Full qualification of the coxph call itself may be protective, however.)
 Second, and more importantly, the call just above will not give the
 correct answer.  The specials are recognized by their name, and
-`survival::cluster' is not the same as `cluster'; the above model would
+\code{survival::cluster} is not the same as \code{cluster}; the above model would
 treat \code{inst} as an ordinary variable.
 A similar issue arises from using \code{stats::offset} as a term, in
 either survival or glm models.

diff --git a/man/survival-internal.Rd b/man/survival-internal.Rd
@@ -26,10 +26,10 @@ match.ratetable(R, ratetable)
 coxpenal.df(hmat, hinv, fdiag, assign.list, ptype, nvar, pen1,
     pen2, sparse)
 coxpenal.fit(x, y, strata, offset, init, control, weights, method,
-    rownames, pcols, pattr, assign)
+    rownames, pcols, pattr, assign, nocenter)
 coxph.wtest(var, b, toler.chol = 1e-09)
 agexact.fit(x, y, strata, offset, init, control, weights, method,
-    rownames, resid=TRUE) 
+    rownames, resid=TRUE, nocenter=NULL) 
 survfitCI(X, Y, weights,  id, cluster, robust, istate,
                        stype=1, ctype=1, 
                        se.fit=TRUE,