/
retrieve_data.R
132 lines (119 loc) · 5.24 KB
/
retrieve_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#' Retrieve data from BacDive
#'
#' @param searchTerm Mandatory character string (in case of the `searchType`
#' `sequence`, `culturecollectionno` or `taxon`) or integer (in case of
#' `bacdive_id`), specifying what shall be searched for.
#'
#' @param searchType Mandatory character string that specifies which type of
#' search will be performed (technically, which API endpoint). Can be
#' `bacdive_id` (default), `sequence`, `culturecollectionno` or `taxon`.
#'
#' @param force_search Logical; default: `FALSE`. Whether or not the searchType
#' should be enforced strictly, even if it appears to mismatch the searchTerm.
#' Please note: forcing an apparently mismatched searchType will most likely
#' result in an error: `retrieve_data(searchTerm = "DSM 319", searchType =
#' "bacdive_id", force_search = TRUE)` without specifying `searchType =
#' "sequence"` should lead to an internal re-specification, and execution of
#' the intended search.
#'
#' @param force_taxon_download Logical; default: `FALSE`. In case of a taxon
#' search, BacDive will return not the actual data of the search results, but
#' only a paged list of URLs pointing to the actual datasets. Setting
#' `force_taxon_download = TRUE` (default: `FALSE`) triggers many downloads of
#' the individual result datasets. Please note: This may take much longer than
#' an unambiguous search, and may cause R(Studio) to be unresponsive
#' temporarily Maybe go for a walk for a few minutes ;-)
#'
#' @return EITHER (from an unambiguous searchTerm, or in case of
#' `force_taxon_download = TRUE`) a list of lists containing the single
#' BacDive dataset for that `searchTerm`,
#'
#' OR (from a _am_biguous search; eg.g `serchType = "taxon"`) a numeric vector
#' of BacDive-IDs, on which you can call your own loop containing
#' `retrieve_data()` to retrieve the individual data sets.
#'
#' @export
#' @examples retrieve_data(searchTerm = 717)
#' retrieve_data(searchTerm = "AJ000733", searchType = "sequence")
#' retrieve_data(searchTerm = "DSM 319", "culturecollectionno")
#' retrieve_data("Bacillus subtilis", searchType = "taxon")
#' retrieve_data("Bacillus subtilis subtilis", searchType = "taxon", force_taxon_download = TRUE)
retrieve_data <- function(searchTerm,
searchType = "bacdive_id",
force_search = FALSE,
force_taxon_download = FALSE) {
x <-
rjson::fromJSON(download(construct_url(searchTerm, searchType, force_search)))
if (force_taxon_download && x$count > 100) warn_slow_download(x$count)
if (identical(names(x), c("count", "next", "previous", "results")) &&
!force_taxon_download) {
return(aggregate_result_IDs(x))
} else if (identical(names(x), c("count", "next", "previous", "results")) &&
force_taxon_download) {
taxon_data <- list()
URLs <- aggregate_result_URLs(x)
for (u in URLs) {
taxon_data <- c(taxon_data,
rjson::fromJSON(download(paste0(u, "?format=json"))))
}
return(taxon_data)
} else if (is.list(x) && length(x) == 1) {
# repeat download, if API returned a single ID, instead of a full dataset
x <- rjson::fromJSON(download(paste0(x[[1]][1]$url, "?format=json")))
return(x)
} else if (identical(x$detail, "Not found")) {
stop(
"Your search returned no result, sorry. Please make sure that you provided a searchTerm, and specified the correct searchType. Please type '?retrieve_data' and read through the 'searchType' section to learn more."
)
} else {
return(x)
}
}
#' Download Something from BacDive
#'
#' @param URL represented by a correctly encoded character string with spaces,
#' thanks to utils::URLencode.
#' @param userpwd A character string of the format
#' `BacDive_email:BacDive_password`. Retrieved from `.Renviron` my default,
#' but also used with something else by the tests.
#'
#' @return A serialised JSON string.
download <- function(URL, userpwd = paste(get_credentials(), collapse = ":")) {
RCurl::getURL(
URL,
userpwd = userpwd,
httpauth = 1L
)
}
#' Aggregate BacDive-IDs from a Paged List of Retrieved URLs
#'
#' @param results A list of paginated URLs returned by an ambiguous
#' `searchTerm` in `retrieve_data()`
#'
#' @return An integer vector of all BacDive IDs within the results.
aggregate_result_IDs <- function(results) {
IDs <- as.numeric(sapply(strsplit(
x = aggregate_result_URLs(results), split = "/"
), function(x)
x[7]))
# IDs are the 7th part in the URls returned by an ambiguous `searchTerm`
# e.g. https://bacdive.dsmz.de/api/bacdive/bacdive_id/138982/
# => [1] "https:" "" "bacdive.dsmz.de" "api"
# => [5] "bacdive" "bacdive_id" "138982
return(IDs)
}
#' Aggregate BacDive-URLs from a Paged List of Retrieved URLs
#'
#' @param results A list of paginated URLs returned by an ambiguous
#' `searchTerm` in `retrieve_data()`.
#'
#' @return An integer vector of all BacDive IDs within the results.
aggregate_result_URLs <- function(results) {
URLs <- c()
for (i in 1:ceiling(x = results$count/100)) {
URLs <- c(URLs, unlist(results$results, use.names = FALSE))
if (is.null(results$`next`)) break
results <- rjson::fromJSON(download(results$`next`))
}
return(URLs)
}