/
utils-data-processing.R
373 lines (338 loc) · 12.2 KB
/
utils-data-processing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
#' @title Check that the times of a dataset are evenly sampled
#' @aliases is_evenly_sampled
#'
#' @param data dataset to check
#' @param period period to check the times against (if `NULL`, first check to
#' see if there is a known `period` set in the metadata, otherwise assumes 1)
#' @param tol tolerance for the period
#'
#' @return `TRUE` or `FALSE`
#'
#' @export
is_equitimed <- function(data, period = NULL, tol = 1e-06)
{
stopifnot(check_data_format(data))
full_times <- get_full_times(data, period, tol)
times <- get_times_from_data(data)
isTRUE(all.equal(times, full_times))
}
#' @export
is_evenly_sampled <- is_equitimed
#' @title Insert rows if necessary so that time series are evenly sampled
#' @aliases make_evenly_sampled
#'
#' @param data dataset to modify
#' @inheritParams is_equitimed
#' @param method one of `c("mean", "method", "closest")` that determines how
#' the rows of the original data will get coerced into the output here.
#' @inheritParams base::mean
#'
#' @return the dataset, with rows coerced according to the equitimed time
#' indices, and additional empty rows inserted if needed
#'
#' @details First, `get_full_times()` computes the sequence of time index values
#' at a regular sampling interval of period. These will be the final time
#' index values for the output. *Some* set of rows of the original dataset
#' will map to each of these time indices.
#'
#' The `method` argument determines how these rows get coerced:
#' \describe{
#' \item{mean}{the values in the rows are averaged together using `mean`}
#' \item{median}{the values in the rows are averaged together using `median`}
#' \item{closest}{the values in the row that is closest in time to the
#' desired time index are used.}
#' }
#'
#' @export
make_equitimed <- function(data, period = NULL, tol = 1e-06,
method = c("mean", "method", "closest"),
na.rm = TRUE)
{
stopifnot(check_data_format(data))
full_times <- get_full_times(data, period, tol)
if (is.null(full_times))
{
stop("Unable to construct an evenly spaced time index.")
}
times <- get_times_from_data(data)
if (isTRUE(all.equal(times, full_times)))
{
message("Dataset is already evenly sampled in time.")
return(invisible(data))
}
# generate empty matrices to hold final abundance and covariates
abundance <- matrix(NA, nrow = length(full_times), ncol = NCOL(data$abundance))
covariates <- data$covariates[0, , drop = FALSE]
# compute separation between times and full_times
times_dist <- outer(times, full_times, function(a, b) {abs(b - a)})
# fill abundance and covariates
method <- match.arg(method)
switch(method,
mean = {
idx <- times_dist <= tol
for (i in seq_along(full_times))
{
abundance[i, ] <- colMeans(data$abundance[idx[, i], , drop = FALSE], na.rm = na.rm)
covariates[i, ] <- purrr::map_dfc(data$covariates[idx[, i], , drop = FALSE], mean, na.rm = TRUE)
}
},
median = {
idx <- times_dist <= tol
for (i in seq_along(full_times))
{
abundance[i, ] <- apply(data$abundance[idx[, i], , drop = FALSE], 2, median, na.rm = na.rm)
covariates[i, ] <- purrr::map_dfc(data$covariates[idx[, i], , drop = FALSE], median, na.rm = TRUE)
}
},
closest = {
idx <- apply(times_dist, 2, which.min)
abundance <- data$abundance[idx,]
covariates <- data$covariates[idx,]
})
# restore column names and convert to tibbles
colnames(abundance) <- colnames(data$abundance)
abundance <- tibble::as_tibble(abundance)
covariates <- tibble::as_tibble(covariates)
# make sure times column is properly filled
time_var <- resolve_covariate_variable(data, "timename")
if (is.null(time_var))
{
# make sure timename variable is unique
new_col_names <- vctrs::vec_as_names(c(colnames(covariates), "time"),
repair = "unique", quiet = TRUE)
time_var <- tail(new_col_names, 1)
data$metadata$timename <- time_var
}
covariates[time_var] <- full_times
# assemble data to return
out <- list(abundance = abundance,
covariates = covariates,
metadata = data$metadata)
attr(out, "class") <- "matssdata"
return(out)
}
#' @export
make_evenly_sampled <- make_equitimed
#' @title Check if a dataset has integer times
#'
#' @param data dataset to check
#'
#' @return `TRUE` or `FALSE`
#'
#' @details If the times are already integer or Date, true. Otherwise FALSE,
#' with a message if times are missing, or if times could potentially be
#' rounded.
#'
#' @export
has_integer_times <- function(data)
{
# check for existence of times
times <- get_times_from_data(data)
if (is.null(times))
{
message("Dataset is missing times.")
return(FALSE)
}
# check for integer times
if (is.integer(times) || inherits(times, "Date"))
{
return(TRUE)
} else if (all(is.wholenumber(times))) {
message("Dataset has close to integer times, but they need to be rounded.\n",
"Perhaps you want to call ", usethis::ui_code("make_integer_times()"), ".\n")
return(FALSE)
}
# otherwise
return(FALSE)
}
#' @title Add a time variable with integer values for evenly sampled data
#'
#' @param data dataset to modify
#' @inheritParams is_equitimed
#'
#' @return the dataset, with integer times
#'
#' @details First, check if the data are evenly sampled in time. If not, we
#' exit early. Next, if the times are already integer or Date, we don't do
#' anything. If the times are numeric, but roundable to integer, we round.
#' Otherwise, we add a new variable to `covariates` from 1:n and designate
#' this variable as the `timename`.
#'
#' @export
make_integer_times <- function(data, period = NULL, tol = 1e-06)
{
times <- get_times_from_data(data)
# do checks based on existing times
if (!is.null(times))
{
# check for equitimed
if (!is_equitimed(data, period, tol))
{
stop(c("Dataset is not evenly sampled in time.\n",
"Perhaps you want to call ", usethis::ui_code("make_equitimed()"), " first.\n"))
}
# check for integer times
if (is.integer(times))
{
message("Dataset is evenly sampled with integer times already.")
return(invisible(data))
} else if (inherits(times, "Date")) {
message("Dataset is evenly sampled with `Date` formatted times already.")
return(invisible(data))
} else if (all(is.wholenumber(times))) {
message("Dataset is evenly sampled with (close to) integer times already.")
message("Rounding times to integer and replacing them...")
time_var <- data$metadata$timename
data$covariates[time_var] <- as.integer(round(times))
return(invisible(data))
}
}
# add time
times <- seq_len(NROW(data$abundance))
if (is.null(data$covariates)) # create covariates
{
time_var <- "time"
data$covariates <- tibble::tibble(time_var = times)
} else {
new_col_names <- vctrs::vec_as_names(c(colnames(data$covariates), "time"),
repair = "unique", quiet = TRUE)
time_var <- tail(new_col_names, 1)
data$covariates[time_var] <- times
}
data$metadata$timename <- time_var
message("Integer times created in variable ", usethis::ui_code(time_var), ".")
return(invisible(data))
}
#' Check for missing samples
#' @aliases is_fully_sampled
#'
#' @description Some analyses may require evenly sampled data without missing
#' values. `has_missing_samples` checks that the dataset is equitimed, and
#' then for missing values within `abundance` (and optionally, `covariates`)
#'
#' `is_full_sampled()` does the same check, but returns `TRUE` if there are
#' NO missing samples.
#'
#' @inheritParams is_equitimed
#' @param check_covariates `TRUE` or `FALSE` (whether to check covariates, too)
#'
#' @return `TRUE` or `FALSE`
#'
#' @export
has_missing_samples <- function(data, period = NULL, tol = 1e-06,
check_covariates = FALSE)
{
if (!is_equitimed(data, period, tol))
{
message(c("Dataset is not evenly sampled in time.\n",
"Perhaps you want to call ", usethis::ui_code("make_equitimed()"), " first.\n"))
return(TRUE)
}
# check abundance
if (any(is.na(data$abundance)))
{
message("Dataset has NA values in ", usethis::ui_code("abundance"), ".")
return(TRUE)
}
# check covariates
if (check_covariates && any(is.na(data$covariates)))
{
message("Dataset has NA values in ", usethis::ui_code("covariates"), ".")
return(TRUE)
}
return(FALSE)
}
#' @export
is_fully_sampled <- function(data, period = NULL, tol = 1e-06,
check_covariates = FALSE)
{
return(!has_missing_samples(data, period, tol, check_covariates))
}
#' @title Impute missing samples using linear interpolation
#'
#' @param data dataset to modify
#' @inheritParams is_equitimed
#' @param interpolate_covariates `TRUE` or `FALSE` (whether to do covariates, too)
#'
#' @return the dataset, with interpolated samples
#'
#' @details First, check if the data are evenly sampled in time. If not, we
#' exit early. Next, apply `forecast::na.interp()` to each variable that has
#' non-finite values.
#'
#' @export
interpolate_missing_samples <- function(data, period = NULL, tol = 1e-06,
interpolate_covariates = FALSE)
{
interpolate_tbl <- function(df)
{
finite_cols_idx <- apply(is.na(df), 2, all)
# replace all non finite values with NA
for (j in which(!finite_cols_idx))
{
x <- df[[j]]
x[!is.finite(x)] <- NA
interpolated <- forecast::na.interp(x)
df[[j]] <- as.numeric(interpolated)
class(df[[j]]) <- class(x)
}
return(df)
}
if (!is_equitimed(data, period, tol))
{
stop(c("Dataset is not evenly sampled in time.\n",
"Perhaps you want to call ", usethis::ui_code("make_equitimed()"), " first.\n"))
}
data$abundance <- interpolate_tbl(data$abundance)
if (interpolate_covariates)
{
data$covariates <- interpolate_tbl(data$covariates)
}
return(invisible(data))
}
#' get the complete time index, filling in gaps where necessary, and using the
#' period to establish the sampling frequency
#'
#' @noRd
get_full_times <- function(data, period = NULL, tol = 1e-06)
{
times <- get_times_from_data(data)
if (is.null(times))
{
stop("Dataset does not appear to have a times variable.\n",
"Check", usethis::ui_code("covariates"), " and ",
usethis::ui_code("metadata$timename"), ".\n")
}
period <- get_period_from_data(data, period)
full_times <- tryCatch(tidyr::full_seq(times, period, tol),
error = function(e) {
message(e$message)
return(NULL)
})
return(full_times)
}
#' extract the period, given the value from the metadata field, and a value
#' specified by the user. The flowchart is:
#' (1) if user has supplied non-null `period`, use that
#' (2) if metadata period is non-null, use that
#' (3) use a default value of 1 and print a message
#'
#' @noRd
get_period_from_data <- function(data, period = NULL)
{
if (is.null(period))
{
period <- data$metadata$period
if (is.null(period))
{
message("No time period found. Assuming period = 1.")
period <- 1
}
}
return(period)
}
#' @noRd
is.wholenumber <- function(x, tol = .Machine$double.eps ^ 0.5)
{
abs(x - round(x)) < tol
}