/
rvest-package.R
67 lines (61 loc) · 2.08 KB
/
rvest-package.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# Package-level documentation block. The "_PACKAGE" sentinel is what roxygen2
# documents here; the tags below import all of rlang and lifecycle::deprecated()
# into the package namespace.
#' @keywords internal
#' @import rlang
#' @importFrom lifecycle deprecated
"_PACKAGE"
#' Static web scraping (with xml2)
#'
#' @description
#' [read_html()] works by performing an HTTP request then parsing the HTML
#' received using the xml2 package. This is "static" scraping because it
#' operates only on the raw HTML file. While this works for most sites,
#' in some cases you will need to use [read_html_live()] if the parts of
#' the page you want to scrape are dynamically generated with JavaScript.
#'
#' Generally, we recommend using `read_html()` if it works, as it will be
#' faster and more robust because it has fewer external dependencies (i.e. it
#' doesn't rely on the Chrome web browser installed on your computer).
#'
#' @inheritParams xml2::read_html
#' @param x Usually a string representing a URL. See [xml2::read_html()] for
#'   other options.
#' @rdname read_html
#' @importFrom xml2 read_html
#' @export
#' @examples
#' # Start by reading an HTML page with read_html():
#' starwars <- read_html("https://rvest.tidyverse.org/articles/starwars.html")
#'
#' # Then find elements that match a css selector or XPath expression
#' # using html_elements(). In this example, each <section> corresponds
#' # to a different film
#' films <- starwars %>% html_elements("section")
#' films
#'
#' # Then use html_element() to extract one element per film. Here
#' # the title is given by the text inside <h2>
#' title <- films %>%
#'   html_element("h2") %>%
#'   html_text2()
#' title
#'
#' # Or use html_attr() to get data out of attributes. html_attr() always
#' # returns a string so we convert it to an integer using a readr function
#' episode <- films %>%
#'   html_element("h2") %>%
#'   html_attr("data-id") %>%
#'   readr::parse_integer()
#' episode
xml2::read_html
# Re-export xml2::url_absolute() so it is also exported from this package's
# namespace.
#' @importFrom xml2 url_absolute
#' @export
xml2::url_absolute
# Re-export the magrittr pipe so `%>%` is also exported from this package's
# namespace.
#' @export
#' @importFrom magrittr %>%
magrittr::`%>%`
# The following block is used by usethis to automatically manage
# roxygen namespace tags. Modify with care!
## usethis namespace: start
#' @importFrom glue glue
## usethis namespace: end
NULL
# Environment created once, when this file is evaluated, and bound to `the`.
# new_environment() is presumably rlang's (rlang is imported wholesale above).
# NOTE(review): looks like a holder for package-internal mutable state --
# confirm against the code that reads/writes `the`.
the <- new_environment()