-
Notifications
You must be signed in to change notification settings - Fork 4
/
imputeCensored.R
242 lines (212 loc) · 8.72 KB
/
imputeCensored.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
###############################################################################
#' Impute Censored Values
#'
#' Impute value for multiply censored data.
#'
#' @param x vector of type survival::Surv
#' @param imputeOption imputation method [default= "mid"], valid impute options
#' are "lower", "upper", "mid", "norm", "lnorm"
#' @details
#' The imputeOption values of \code{"lower"}, \code{"upper"} and \code{"mid"}
#' impute the lower limit, upper limit, and midpoint between the lower and upper
#' limit. In the context of typical water quality data, these options would be
#' equivalent to zero, detection limit and 1/2 detection limit substitution.
#' Options for substituting the normal [\code{"norm"}] or lognormal
#' [\code{"lnorm"}] expectation can also be used.
#'
#' @examples
#' \dontrun{
#' x <- dataCensored[1:20,"tdp"]
#' x.lower <- impute(x,'lower')
#' x.mid <- impute(x,'mid')
#' x.upper <- impute(x,'upper')
#' x.norm <- impute(x,'norm')
#' x.lnorm <- impute(x,'lnorm')
#' }
#' @return vector where x is transformed into a simple numeric variable
#'
#' @seealso \code{\link{makeSurvDF}}, \code{\link{unSurvDF}}
#' , \code{\link{unSurv}}, \code{\link{imputeDF}}, \code{\link{imputeDF}},
#'
#' @export
#
impute <-function(x, imputeOption="mid") {
# ----- Change history --------------------------------------------
# 22May2020: JBH: Converted to survival::Surv objects
# 01Jun2018: JBH: Updated impute function to include maximum likelihood
# estimation assuming normal and lognormal distribution
# 01May2018: JBH: changed function name from .impute to impute; focus
# lower, mid and upper options; accomodate NAs
# 11Oct2016: JBH: corrected documentation, switched to explicit '::'
# for external function calls
# Initialization ####
# Set valid imputeOption
validImpute <- c("lower", "upper", "mid", "uniform", "norm", "lnorm")
# Error traps ####
# Error trap ... make sure x variable is a qw class
if (!(class(x) %in% c("Surv"))) {
stop("input vector not of type Surv.")
}
# Error trap ... make sure imputeOption is one of the valid choices
if (!(imputeOption %in% validImpute)) {
stop("imputeOption not valid.")
}
### If there are -Inf lower limits or Inf upper limits, then reset the
### lower and upper limits to address -Inf and Inf values
# extract lower and upper limits from Surv object
x <- unSurv(x)
if (any(x[,1] == -Inf, na.rm=TRUE) | any(x[,2] == Inf, na.rm=TRUE)) {
# pull out lower/upper values into easy-to-read vectors
lower <- x[,1]
upper <- x[,2]
# compute lower reset: 25% lower than smallest value
lowerReset1 <- min( lower [!(lower == -Inf)] , na.rm=TRUE) # smallest lower
lowerReset2 <- min( upper [!(upper == -Inf)] , na.rm=TRUE) # smallest upper
lowerReset <- min(lowerReset1, lowerReset2, na.rm=TRUE) -
0.25 * abs(min(lowerReset1, lowerReset2, na.rm=TRUE))
# compute upper reset: 25% larger than non-Inf value
upperReset1 <- max( lower [!(lower == Inf)], na.rm=TRUE)
# largest non-Inf lower
upperReset2 <- max( upper [!(upper == Inf)], na.rm=TRUE)
# largest non-Inf upper
upperReset <- max(upperReset1, upperReset2, na.rm=TRUE) +
0.25 * abs(max(upperReset1, upperReset2, na.rm=TRUE))
# apply lower and upper reset values
x[x[,1] == -Inf , 1] <- lowerReset
x[x[,2] == Inf , 2] <- upperReset
}
# Impute censored values ####
# 01May2018: update to accomodate vector with NA's
# recreate Surv object after addressing non-finite boundaries
x2 <- survival::Surv(time = x[,1], time2 = x[,2], type = "interval2")
# vector of NAs to receive imputted results
x3 <- rep(NA,nrow(x))
# logical vector identifying data presence
not.miss <- !is.na(x[,1])
# implement imputation, writing results into x3
if (imputeOption == "lower") {
x3 <- x[,1]
} else if (imputeOption == "upper") {
x3 <- x[,2]
} else if (imputeOption == "mid") {
x3 <- 0.5 * (x[,1] + x[,2])
} else if (imputeOption == "uniform") {
x3[not.miss] <- stats::runif(sum(not.miss), x[not.miss,1], x[not.miss,2])
} else if (imputeOption == "norm") {
myData1 <- data.frame(x=x2[not.miss], left=x[not.miss,1]
, right=x[not.miss,2])
fn1 <- fitdistrplus::fitdistcens(myData1, distr="norm")
mu.norm <- fn1$estimate[1]
sd.norm <- fn1$estimate[2]
myData1$normExpec <- .ExpNmCens(myData1,dep="x",mu.norm,sd.norm)[,1]
x3[not.miss] <- myData1$normExpec
} else if (imputeOption == "lnorm") {
myData1 <- data.frame(x=x2[not.miss], left=x[not.miss,1]
, right=x[not.miss,2])
fn1 <- fitdistrplus::fitdistcens(myData1, distr="lnorm")
mu.lnorm <- fn1$estimate[1]
sd.lnorm <- fn1$estimate[2]
myData1$lnormExpec <- .ExpLNmCens(myData1,dep="x",mu.lnorm,sd.lnorm)[,1]
x3[not.miss] <- myData1$lnormExpec
}
x <- x3
# Return ####
return(x)
}
###############################################################################
#' Impute Censored Values in dataframes
#'
#' Impute value for multiply censored data in data frames
#'
#' @param df dataframe
#' @param imputeOption imputation method [default= "mid"], valid impute options
#' are "lower", "upper", "mid", "norm", "lnorm"
#' @details
#' The imputeOption values of \code{"lower"}, \code{"upper"} and \code{"mid"}
#' impute the lower limit, upper limit, and midpoint between the lower and upper
#' limit. In the context of typical water quality data, these options would be
#' equivalent to zero, detection limit and 1/2 detection limit substitution.
#' Options for substituting the normal [\code{"norm"}] or lognormal
#' [\code{"lnorm"}] expectation can also be used.
#'
#' @examples
#' \dontrun{
#' df <- dataCensored[1:20, ]
#' df.lower <- imputeDF(df,'lower')
#' df.mid <- imputeDF(df,'mid')
#' df.upper <- imputeDF(df,'upper')
#' df.norm <- imputeDF(df,'norm')
#' df.lnorm <- imputeDF(df,'lnorm')
#' }
#' @return dataframe where fields with censored data (i.e., Surv objects) are
#' transformed into a simple numeric fields
#'
#' @seealso \code{\link{makeSurvDF}}, \code{\link{unSurvDF}}
#' , \code{\link{unSurv}}, \code{\link{impute}}, \code{\link{imputeDF}},
#'
#' @export
#
imputeDF <- function(df, imputeOption = "mid") {
# Initialization & Error Traps ####
{
# Set valid imputeOption
validImpute <- c("lower", "upper", "mid", "uniform", "norm", "lnorm")
# Error trap ... make sure df variable is a data frame
if (!(class(df) %in% c("data.frame"))) {
stop("input vector not of type Surv.")
}
# Error trap ... make sure imputeOption is one of the valid choices
if (!(imputeOption %in% validImpute)) {
stop("imputeOption not valid.")
}
}
# Convert and return data frame ####
{
i <- sapply(df,survival::is.Surv)
df2 <- cbind(df[!i], sapply(df[i], impute, imputeOption=imputeOption))
df2 <- df2[,names(df)]
return(df2)
}
}
#' # simCensored ####
#' #' Fit normal or log normal distribution; simulation in censored data
#' #' provided by Elgin Perry
#' #' phi.left and phi.right are distribution function, e.g.,
#' # pnorm(q=10,mean=10,sd=1) =0.5
#' #' phi.sim is a uniform random number between phi.left and phi.right
#' #' sim is imputed value
#' #'
#' #' @param x data
#' #' @param distr Distribution type; norm (default) or lnorm
#' #' @return qnorm (or qlnorm) values for input data
#' #' @keywords internal
#' #' @export
#' .simCensored <- function(x,distr="norm") {
#' # 12Mar2019: EWL: Added roxygen documentation
#' tmp <- data.frame(left = x@.Data[,1],
#' right = x@.Data[,2])
#' fn1 <- fitdistrplus::fitdistcens(tmp, distr=distr)
#' summary(fn1)
#' if(distr=="norm")
#' {
#' tmp$phi.left <- stats::pnorm(tmp$left,mean=fn1$estimate[1]
#' ,sd=fn1$estimate[2])
#' tmp$phi.right <- stats::pnorm(tmp$right,mean=fn1$estimate[1]
#' ,sd=fn1$estimate[2])
#' tmp$phi.sim <- stats::runif(nrow(tmp),tmp$phi.left,tmp$phi.right)
#' tmp$sim <- stats::qnorm(tmp$phi.sim,mean=fn1$estimate[1]
#' ,sd=fn1$estimate[2])
#' }
#' if(distr=="lnorm")
#' {
#' tmp$phi.left <- stats::plnorm(tmp$left,meanlog=fn1$estimate[1]
#' ,sdlog=fn1$estimate[2])
#' tmp$phi.right <- stats::plnorm(tmp$right,meanlog=fn1$estimate[1]
#' ,sdlog=fn1$estimate[2])
#' tmp$phi.sim <- stats::runif(nrow(tmp),tmp$phi.left,tmp$phi.right)
#' tmp$sim <- stats::qlnorm(tmp$phi.sim,meanlog=fn1$estimate[1]
#' ,sdlog=fn1$estimate[2])
#' }
#' return(tmp$sim)
#' }
#'