#' Staged imputation
#'
#' Impute missing data at different utterance lengths using successive linear
#' models.
#'
#' @param data dataframe in which to impute missing values
#' @param var_y bare name of the response variable for imputation
#' @param var_length bare name of the length variable
#' @param id_cols a selection of variable names that uniquely identify each
#' group of related observations. For example, `c(child_id, age_months)`.
#' @param include_max_length whether to use the maximum length value as a
#' predictor in the imputation models. Defaults to `FALSE`.
#' @param data_train (optional) dataframe used to train the imputation models.
#' For example, we might have data from a reference group of children in
#' `data_train` but a clinical population in `data`. If omitted, the dataframe
#' in `data` is used to train the models.
#' @return a dataframe with the additional columns `{var_y}_imputed` (the
#' imputed values), `.max_{var_length}` (the highest value of `var_length`
#' with observed data), and `{var_y}_imputation` (a label for whether each
#' observation was `"imputed"` or `"observed"`).
#' @export
#' @details
#'
#' # Background
#'
#' In Hustad and colleagues (2020), we modeled intelligibility data in young
#' children's speech. Children would hear an utterance and then they would
#' repeat it. The utterances started at 2 words in length, then increased to 3
#' words in length, and so on in batches of 10 sentences, all the way to 7
#' words in length. There was a problem, however: Not all of the children could
#' produce utterances at every length. Specifically, if a child could not
#' reliably produce 5 utterances of a given length, the task was halted. So,
#' given the nature of the task, if a child produced 5-word utterances, they
#' also produced 2--4-word utterances.
#'
#' The length of the utterance probably influenced the outcome variable: Longer
#' utterances have more words that might help a listener understand the
#' sentence, for example. Therefore, it did not seem appropriate to ignore the
#' missing values. We used the following two-step procedure (see the
#' [Supplemental Materials](https://doi.org/10.23641/asha.12330956.v1) for more
#' detail):
#'
#' 1. Impute missing values at each utterance length using successive linear
#'    models, predicting the value at each length from the values at all
#'    shorter lengths (this function).
#' 2. Weight each utterance length by the probability of a child reaching
#'    that length, estimated with an ordinal regression model
#'    ([weight_lengths_with_ordinal_model()]).
#'
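#' A minimal sketch of step 1's idea, assuming a toy wide dataframe
#' `data_wide` with one column per utterance length (`y_2`, ..., `y_5`, with
#' `NA` for missing data); the column names here are hypothetical:
#'
#' ```r
#' # Toy illustration (not the exported function): impute y_4, then y_5.
#' # lm() drops incomplete rows, so each model is fit on complete cases
#' m_4 <- lm(y_4 ~ y_3 + y_2, data = data_wide)
#' m_5 <- lm(y_5 ~ y_4 + y_3 + y_2, data = data_wide)
#' # impute in order so that an imputed y_4 can feed the model for y_5
#' data_wide$y_4 <- ifelse(
#'   is.na(data_wide$y_4), predict(m_4, data_wide), data_wide$y_4
#' )
#' data_wide$y_5 <- ifelse(
#'   is.na(data_wide$y_5), predict(m_5, data_wide), data_wide$y_5
#' )
#' ```
#'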
#' ## Other notes
#'
#' A remark about `data` and `data_train`: One might ask, *why shouldn't the
#' children in `data` help train the imputation models?* Consider a
#' norm-referenced standardized testing scenario: We have a new participant
#' (observations in `data`), and we want to know how they compare to their age
#' peers (participants in `data_train`). By separating out `data_train` and
#' fixing it to a reference group, we can apply the same adjustment/imputation
#' procedure to all new participants.
#'
#' @references Hustad, K. C., Mahr, T., Natzke, P. E. M., & Rathouz, P. J.
#' (2020). Development of Speech Intelligibility Between 30 and 47 Months in
#' Typically Developing Children: A Cross-Sectional Study of Growth. *Journal
#' of Speech, Language, and Hearing Research*, *63*(6), 1675–1687.
#' https://doi.org/10.1044/2020_JSLHR-20-00008
#'
#' Hustad, K. C., Mahr, T., Natzke, P. E. M., & Rathouz, P. J. (2020).
#' Supplemental Material S1 (Hustad et al., 2020). ASHA journals.
#' https://doi.org/10.23641/asha.12330956.v1
#'
#' @examples
#' set.seed(1)
#' fake_data <- tibble::tibble(
#'   child = rep(c("a", "b", "c", "e", "f", "g", "h", "i"), each = 5),
#'   level = rep(1:5, times = 8),
#'   x = c(
#'     c(100, 110, 120, 130, 150) + c(-8, -5, 0, NA, NA),
#'     c(100, 110, 120, 130, 150) + c(6, 6, 4, NA, NA),
#'     c(100, 110, 120, 130, 150) + c(-5, -5, -2, 2, NA),
#'     c(100, 110, 120, 130, 150) + rbinom(5, 12, .5) - 6,
#'     c(100, 110, 120, 130, 150) + rbinom(5, 12, .5) - 6,
#'     c(100, 110, 120, 130, 150) + rbinom(5, 12, .5) - 6,
#'     c(100, 110, 120, 130, 150) + rbinom(5, 12, .5) - 6,
#'     c(100, 110, 120, 130, 150) + rbinom(5, 12, .5) - 6
#'   )
#' )
#' data_imputed <- impute_values_by_length(
#'   fake_data,
#'   x,
#'   level,
#'   id_cols = c(child),
#'   include_max_length = FALSE
#' )
#'
#' if (requireNamespace("ggplot2", quietly = TRUE)) {
#'   library(ggplot2)
#'   ggplot(data_imputed) +
#'     aes(x = level, y = x_imputed) +
#'     geom_line(aes(group = child)) +
#'     geom_point(aes(color = x_imputation))
#' }
impute_values_by_length <- function(
  data,
  var_y,
  var_length,
  id_cols = NULL,
  include_max_length = FALSE,
  data_train = NULL
) {
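  # Column index of the last non-NA value in a row; given the task's
  # monotone missingness, this marks the longest length with observed data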
  get_position_of_last_non_na_value <- function(xs) {
    max(which(!is.na(xs)))
  }
  fit_imputation_models <- function(data, spec, pred_var_max_length) {
    create_formulas <- function(spec, pred_var_max_length) {
      n_models <- nrow(spec)
      l <- seq_len(n_models) |> as.list()
      names <- spec[, ".name", drop = TRUE]
      for (i in seq_len(n_models)) {
        if (i == 1) next
        # the last model omits the longest-length variable because it is
        # constant among the rows used to fit that model
        ll_var <- if (i != n_models) pred_var_max_length else character(0)
        lhs_var <- names[i]
        rhs_vars <- names[seq(1, i - 1)] |>
          c(ll_var) |>
          paste(collapse = " + ")
        l[[i]] <- rlang::new_formula(
          lhs = rlang::sym(lhs_var),
          rhs = rlang::parse_expr(rhs_vars)
        )
      }
      names(l) <- names
      l[[1]] <- NULL
      l
    }
    create_formulas(spec, pred_var_max_length) |>
      lapply(stats::lm, data = data)
  }
  chr_var_y <- rlang::englue("{{ var_y }}")
  chr_var_length <- rlang::englue("{{ var_length }}")
  chr_var_max_length <- rlang::englue(".max_{{ var_length }}")
  chr_var_y_imputed <- rlang::englue("{{ var_y }}_imputed")
  chr_var_y_imputation <- rlang::englue("{{ var_y }}_imputation")
  # Create a wide dataframe with columns y_1, y_2, y_3, etc. so that
  # we can fit models like y_3 ~ y_2 + y_1.
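  # e.g. with lengths 1-5 and include_max_length = TRUE, the fitted models:
  #   y_2 ~ y_1 + .max_<length>
  #   y_3 ~ y_2 + y_1 + .max_<length>
  #   y_4 ~ y_3 + y_2 + y_1 + .max_<length>
  #   y_5 ~ y_4 + y_3 + y_2 + y_1   (max length is constant here)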
  spec <- tidyr::build_wider_spec(
    data,
    names_from = {{ var_length }},
    names_prefix = "y_",
    values_from = {{ var_y }}
  )
  data_wide <- tidyr::pivot_wider_spec(data, spec, id_cols = {{ id_cols }})
  # Store the longest length value without missing data
  data_wide[[chr_var_max_length]] <- data_wide |>
    dplyr::select(dplyr::all_of(spec$.name)) |>
    apply(1, get_position_of_last_non_na_value)
  # Set up the training dataset: use `data_train` if it was provided,
  # otherwise train the models on `data` itself
  if (is.null(data_train)) {
    data_train <- data
    data_wide_train <- data_wide
  } else {
    data_wide_train <- tidyr::pivot_wider_spec(
      data_train,
      spec = spec,
      id_cols = {{ id_cols }}
    )
    data_wide_train[[chr_var_max_length]] <- data_wide_train |>
      dplyr::select(dplyr::all_of(spec$.name)) |>
      apply(1, get_position_of_last_non_na_value)
  }
  # Train the imputation models
  if (!include_max_length) {
    pred_var_max_length <- character(0)
  } else {
    pred_var_max_length <- chr_var_max_length
  }
  models <- fit_imputation_models(data_wide_train, spec, pred_var_max_length)
  # Impute in a loop so that an imputed y_i can be used as a predictor for
  # a later y_j
  data_imputed <- data_wide
  for (y_name in names(models)) {
    vals <- data_imputed[[y_name]]
    data_imputed[[y_name]] <- ifelse(
      is.na(vals),
      stats::predict(models[[y_name]], data_imputed),
      vals
    )
  }
  # Reshape back to long.
  data_imputed_long <- data_imputed |>
    tidyr::pivot_longer_spec(spec) |>
    dplyr::rename(!! chr_var_y_imputed := {{ var_y }})
  data_original_values <- data_wide |>
    tidyr::pivot_longer_spec(spec)
  join_set <- data_imputed_long |>
    dplyr::select(
      {{ id_cols }},
      tidyselect::all_of(c(chr_var_length, chr_var_max_length))
    ) |>
    colnames()
  data_both <- data_original_values |>
    dplyr::left_join(data_imputed_long, by = join_set)
  data_both[[chr_var_y_imputation]] <- ifelse(
    is.na(data_both[[chr_var_y]]) & !is.na(data_both[[chr_var_y_imputed]]),
    "imputed",
    "observed"
  )
  data_both
}
#' Weight utterance lengths by using an ordinal regression model
#'
#' For each participant, we find their longest utterance length. We model
#' this longest length as a nonlinear function of some predictor variable,
#' and we compute the probability of reaching each utterance length at each
#' value of the predictor. These probabilities are then normalized to provide
#' weights for each utterance length.
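#'
#' As a toy illustration of the normalization step, with hypothetical
#' probabilities of reaching lengths 2 through 6 at a single value of
#' `var_x`:
#'
#' ```r
#' p_reach <- c("2" = 1.0, "3" = 1.0, "4" = 0.8, "5" = 0.5, "6" = 0.2)
#' round(p_reach / sum(p_reach), 3)  # hypothetical numbers; weights sum to 1
#' #>     2     3     4     5     6
#' #> 0.286 0.286 0.229 0.143 0.057
#' ```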
#'
#' @param data_train dataframe used to train the ordinal model
#' @param var_length bare name of the length variable. For example,
#' `tocs_level`.
#' @param var_x bare name of the predictor variable. For example, `age_months`.
#' @param id_cols a selection of variable names that uniquely identify each
#' group of related observations. For example, `c(child_id, age_months)`.
#' @param spline_df number of degrees of freedom for the natural spline
#' (`splines::ns()`) of `var_x` in the ordinal regression model.
#' @param data_join (optional) dataset to join the weights onto. This feature
#' is useful because we might train the model on the observed data but
#' supply the weights to a dataset whose missing values have been imputed.
#' @return the probability of reaching each utterance length and the
#' corresponding weight at each observed value of `var_x`. These are in the
#' added columns `{var_length}_prob_reached` and `{var_length}_weight`,
#' respectively.
#' @export
#' @references Hustad, K. C., Mahr, T., Natzke, P. E. M., & Rathouz, P. J.
#' (2020). Development of Speech Intelligibility Between 30 and 47 Months in
#' Typically Developing Children: A Cross-Sectional Study of Growth. *Journal
#' of Speech, Language, and Hearing Research*, *63*(6), 1675–1687.
#' https://doi.org/10.1044/2020_JSLHR-20-00008
#'
#' Hustad, K. C., Mahr, T., Natzke, P. E. M., & Rathouz, P. J. (2020).
#' Supplemental Material S1 (Hustad et al., 2020). ASHA journals.
#' https://doi.org/10.23641/asha.12330956.v1
#'
#' @examples
#' data_weights <- weight_lengths_with_ordinal_model(
#'   data_example_intelligibility_by_length,
#'   tocs_level,
#'   age_months,
#'   child,
#'   spline_df = 2
#' )
#'
#' if (requireNamespace("ggplot2", quietly = TRUE)) {
#'   library(ggplot2)
#'   p1 <- ggplot(data_weights) +
#'     aes(x = age_months, y = tocs_level_prob_reached) +
#'     geom_line(aes(color = ordered(tocs_level)), linewidth = 1) +
#'     scale_color_ordinal(end = .85) +
#'     labs(y = "Prob. of reaching length", color = "Utterance length")
#'   print(p1)
#'
#'   p2 <- p1 +
#'     aes(y = tocs_level_weight) +
#'     labs(y = "Weight of utterance length")
#'   print(p2)
#' }
weight_lengths_with_ordinal_model <- function(
  data_train,
  var_length,
  var_x,
  id_cols,
  spline_df = 2,
  data_join = NULL
) {
  chr_var_x <- rlang::englue("{{ var_x }}")
  chr_var_length <- rlang::englue("{{ var_length }}")
  chr_var_longest_length <- rlang::englue(".longest_{{ var_length }}")
  chr_cols_to_select <- c(chr_var_length, chr_var_x, chr_var_longest_length)
  chr_var_longest_length_ord <- rlang::englue(".longest_{{ var_length }}_ord")
  chr_var_length_prob <- rlang::englue("{{ var_length }}_prob_reached")
  chr_var_length_weight <- rlang::englue("{{ var_length }}_weight")
  expr_x <- rlang::enexpr(var_x)
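  # To fit the ordinal model, reduce the data to one row per group: each
  # participant's longest observed utterance length becomes the outcome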
  data_model <- data_train |>
    dplyr::group_by(dplyr::pick({{ id_cols }})) |>
    dplyr::mutate(
      "{chr_var_longest_length}" := max({{ var_length }}, na.rm = TRUE)
    ) |>
    dplyr::ungroup() |>
    dplyr::distinct(
      dplyr::pick({{ id_cols }}, dplyr::all_of(chr_cols_to_select))
    ) |>
    dplyr::mutate(
      "{chr_var_longest_length_ord}" :=
        ordered(.data[[chr_var_longest_length]])
    ) |>
    dplyr::filter(
      {{ var_length }} == .data[[chr_var_longest_length]]
    )
  f <- rlang::new_formula(
    lhs = rlang::sym(chr_var_longest_length_ord),
    rhs = rlang::expr(splines::ns(!! expr_x, df = !! spline_df))
  )
  model <- ordinal::clm(f, data = data_model)
  data_x <- data_train |>
    dplyr::distinct({{ var_x }})
  # Each length gets a probability, so we have to reshape from a wide matrix
  # into a long dataframe
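  # stats::predict() on a clm fit with type = "prob" returns a list whose
  # "fit" element is a matrix: one row per value of x, one column per
  # ordinal level (utterance length)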
  data_probs <- stats::predict(model, data_x, type = "prob") |>
    getElement("fit") |>
    dplyr::bind_cols(data_x) |>
    tidyr::pivot_longer(
      cols = -dplyr::all_of(names(data_x)),
      names_to = chr_var_length,
      names_transform = as.integer,
      values_to = chr_var_length_prob
    ) |>
    # Accumulate from the longest length downward to get the probability of
    # reaching each length, then normalize into weights at each x (age)
    dplyr::group_by({{ var_x }}) |>
    dplyr::arrange(dplyr::desc({{ var_length }})) |>
    dplyr::mutate(
      "{chr_var_length_prob}" := cumsum(.data[[chr_var_length_prob]]),
      "{chr_var_length_weight}" :=
        .data[[chr_var_length_prob]] / sum(.data[[chr_var_length_prob]])
    ) |>
    dplyr::ungroup()
  if (!is.null(data_join)) {
    data_join |>
      dplyr::left_join(data_probs, by = c(chr_var_length, chr_var_x))
  } else {
    data_probs
  }
}