Skip to content

Commit

Permalink
Rename html_node(s) to html_element(s)
Browse files Browse the repository at this point in the history
Fixes #298
  • Loading branch information
hadley committed Jan 6, 2021
1 parent 9825ff6 commit 6fe2c45
Show file tree
Hide file tree
Showing 34 changed files with 354 additions and 333 deletions.
10 changes: 6 additions & 4 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
S3method(cookies,rvest_session)
S3method(format,rvest_field)
S3method(headers,rvest_session)
S3method(html_element,default)
S3method(html_element,rvest_session)
S3method(html_elements,default)
S3method(html_elements,rvest_session)
S3method(html_form,rvest_session)
S3method(html_form,xml_document)
S3method(html_form,xml_node)
S3method(html_form,xml_nodeset)
S3method(html_node,default)
S3method(html_node,rvest_session)
S3method(html_nodes,default)
S3method(html_nodes,rvest_session)
S3method(html_table,rvest_session)
S3method(html_table,xml_document)
S3method(html_table,xml_node)
Expand All @@ -33,6 +33,8 @@ export(guess_encoding)
export(html_attr)
export(html_attrs)
export(html_children)
export(html_element)
export(html_elements)
export(html_encoding_guess)
export(html_form)
export(html_form_set)
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# rvest (development version)

* `html_node()` and `html_nodes()` have been superseded in favour of
`html_element()` and `html_elements()` since they (almost) always return
elements, not nodes (#298).

* `html_table()` gains `na.strings` argument to control what values are
converted to `NA` (#107).

Expand Down
6 changes: 3 additions & 3 deletions R/encoding.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@
#' # A file with bad encoding included in the package
#' path <- system.file("html-ex", "bad-encoding.html", package = "rvest")
#' x <- read_html(path)
#' x %>% html_nodes("p") %>% html_text()
#' x %>% html_elements("p") %>% html_text()
#'
#' html_encoding_guess(x)
#' # Two valid encodings, only one of which is correct
#' read_html(path, encoding = "ISO-8859-1") %>% html_nodes("p") %>% html_text()
#' read_html(path, encoding = "ISO-8859-2") %>% html_nodes("p") %>% html_text()
#' read_html(path, encoding = "ISO-8859-1") %>% html_elements("p") %>% html_text()
#' read_html(path, encoding = "ISO-8859-2") %>% html_elements("p") %>% html_text()
html_encoding_guess <- function(x) {
check_installed("stringi")

Expand Down
4 changes: 2 additions & 2 deletions R/form.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ html_form.xml_node <- function(x) {
method <- toupper(attr$method %||% "GET")
enctype <- convert_enctype(attr$enctype)

nodes <- html_nodes(x, "input, select, textarea, button")
nodes <- html_elements(x, "input, select, textarea, button")
fields <- lapply(nodes, function(x) {
switch(xml2::xml_name(x),
textarea = parse_textarea(x),
Expand Down Expand Up @@ -147,7 +147,7 @@ parse_input <- function(x) {

parse_select <- function(x) {
attr <- as.list(xml2::xml_attrs(x))
options <- parse_options(html_nodes(x, "option"))
options <- parse_options(html_elements(x, "option"))

rvest_field(
type = "select",
Expand Down
10 changes: 5 additions & 5 deletions R/html.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#' Get name and attributes from nodes
#' Get name and attributes from elements
#'
#' `html_name()` gets the tag name, `html_attr()` gets a single attribute,
#' and `html_attrs()` gets all attributes.
Expand All @@ -10,7 +10,7 @@
#' @examples
#' url <- "https://en.wikipedia.org/wiki/The_Lego_Movie"
#' movie <- read_html(url)
#' cast <- html_nodes(movie, "tr:nth-child(8) .plainlist a")
#' cast <- html_elements(movie, "tr:nth-child(8) .plainlist a")
#'
#' html_name(cast)
#' html_attrs(cast)
Expand All @@ -27,7 +27,7 @@ html_name <- function(x) {
#' @rdname html_name
#' @param name Name of attribute to retrieve.
#' @param default A string used as a default value when the attribute does
#' not exist in every node.
#' not exist in every element.
#' @export
#' @importFrom xml2 xml_attr
html_attr <- function(x, name, default = NA_character_) {
Expand All @@ -47,11 +47,11 @@ html_attrs <- function(x) {
#' @inheritParams xml2::xml_text
#' @examples
#' html <- minimal_html("<ul><li>1<li>2<li>3</ul>")
#' ul <- html_nodes(html, "ul")
#' ul <- html_elements(html, "ul")
#' html_children(ul)
#'
#' html <- minimal_html("<p>Hello <b>Hadley</b><i>!</i>")
#' p <- html_nodes(html, "p")
#' p <- html_elements(html, "p")
#' html_children(p)
#' @importFrom xml2 xml_children
html_children <- function(x) {
Expand Down
23 changes: 19 additions & 4 deletions R/rename.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,11 @@
#' * `set_values()` -> `html_form_set()`
#' * `submit_form()` -> `session_submit()`
#' * `xml_tag()` -> `html_name()`
#' * `xml_node()` -> `html_node()`
#' * `xml_nodes()` -> `html_nodes()`
#' * `xml_node()` & `html_node()` -> `html_element()`
#' * `xml_nodes()` & `html_nodes()` -> `html_elements()`
#'
#' (`html_node()` and `html_nodes()` are only superseded, rather than
#' deprecated, because they're so widely used.)
#'
#' @keywords internal
#' @name rename
Expand Down Expand Up @@ -42,13 +45,25 @@ xml_tag <- function(x) {
#' @export
#' @rdname rename
xml_node <- function(...) {
lifecycle::deprecate_warn("1.0.0", "xml_node()", "html_node()")
lifecycle::deprecate_warn("1.0.0", "xml_node()", "html_element()")
html_node(...)
}

#' @export
#' @rdname rename
xml_nodes <- function(...) {
lifecycle::deprecate_warn("1.0.0", "xml_nodes()", "html_nodes()")
lifecycle::deprecate_warn("1.0.0", "xml_nodes()", "html_elements()")
html_nodes(...)
}

#' @export
#' @rdname rename
html_nodes <- function(...) {
html_elements(...)
}

#' @export
#' @rdname rename
html_node <- function(...) {
html_element(...)
}
70 changes: 35 additions & 35 deletions R/selectors.R
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
#' Select nodes from an HTML document
#' Select elements from an HTML document
#'
#' `html_node()` and `html_nodes()` find HTML tags (nodes) using CSS selectors
#' or XPath expressions.
#'
#' CSS selectors are particularly useful in conjunction with
#' <https://selectorgadget.com/>, which makes it very easy to discover the
#' selector you need. If you haven't used CSS selectors before, I'd recommend
#' starting with the the fun tutorial at <http://flukeout.github.io/>.
#' `html_element()` and `html_elements()` find HTML elements using CSS
#' selectors or XPath expressions. CSS selectors are particularly useful in
#' conjunction with <https://selectorgadget.com/>, which makes it very easy to
#' discover the selector you need.
#'
#' @section CSS selector support:
#'
Expand All @@ -29,10 +26,11 @@
#' simple selector.
#'
#' @param x Either a document, a node set or a single node.
#' @param css,xpath Nodes to select. Supply one of `css` or `xpath`
#' depending on whether you want to use a CSS or XPath 1.0 selector.
#' @returns `html_node()` returns a nodeset the same length as the input.
#' `html_nodes()` flattens the output so there's no direct way to map
#' @param css,xpath Elements to select. Supply one of `css` or `xpath`
#' depending on whether you want to use a CSS selector or XPath 1.0
#' expression.
#' @returns `html_element()` returns a nodeset the same length as the input.
#' `html_elements()` flattens the output so there's no direct way to map
#' the output to the input.
#' @export
#' @examples
Expand All @@ -41,44 +39,46 @@
#' "https://www.boxofficemojo.com/movies/?id=ateam.htm"
#' )
#' ateam <- read_html(url)
#' html_nodes(ateam, "center")
#' html_nodes(ateam, "center font")
#' html_nodes(ateam, "center font b")
#' html_elements(ateam, "center")
#' html_elements(ateam, "center font")
#' html_elements(ateam, "center font b")
#'
#' # html_elements() is well suited to use with the pipe
#' ateam %>% html_nodes("center") %>% html_nodes("td")
#' ateam %>% html_nodes("center") %>% html_nodes("font")
#' ateam %>% html_elements("center") %>% html_elements("td")
#' ateam %>% html_elements("center") %>% html_elements("font")
#'
#' td <- ateam %>% html_nodes("center") %>% html_nodes("td")
#' td <- ateam %>% html_elements("center") %>% html_elements("td")
#' td
#' # When applied to a list of nodes, html_nodes() returns all matching nodes
#' # beneath any of the elements, flattening results into a new nodelist.
#' td %>% html_nodes("font")
#'
#' # html_node() returns the first matching node. If there are no matching
#' # nodes, it returns a "missing" node
#' td %>% html_node("font")
#' # When applied to a node set, html_elements() returns all matching elements
#' # beneath any of the inputs, flattening results into a new node set.
#' td %>% html_elements("font")
#'
#' # html_element() returns the first matching element. If there are no
#' # matching elements, it returns a "missing" element
#' td %>% html_element("font")
#' # and html_text() and html_attr() will return NA
#' td %>% html_element("font") %>% html_text()
#'
#' # To pick out an element or elements at specified positions, use [[ and [
#' ateam %>% html_nodes("table") %>% .[[1]] %>% html_nodes("img")
#' ateam %>% html_nodes("table") %>% .[1:2] %>% html_nodes("img")
html_nodes <- function(x, css, xpath) {
UseMethod("html_nodes")
#' ateam %>% html_elements("table") %>% .[[1]] %>% html_elements("img")
#' ateam %>% html_elements("table") %>% .[1:2] %>% html_elements("img")
html_element <- function(x, css, xpath) {
UseMethod("html_element")
}

#' @export
html_nodes.default <- function(x, css, xpath) {
xml2::xml_find_all(x, make_selector(css, xpath))
#' @rdname html_element
html_elements <- function(x, css, xpath) {
UseMethod("html_elements")
}

#' @export
#' @rdname html_nodes
html_node <- function(x, css, xpath) {
UseMethod("html_node")
html_elements.default <- function(x, css, xpath) {
xml2::xml_find_all(x, make_selector(css, xpath))
}

#' @export
html_node.default <- function(x, css, xpath) {
html_element.default <- function(x, css, xpath) {
xml2::xml_find_first(x, make_selector(css, xpath))
}

Expand Down
18 changes: 9 additions & 9 deletions R/session.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#' * Submit an [html_form] with `session_submit()`.
#' * View the history with `session_history()` and navigate back and forward
#' with `back()` and `forward()`.
#' * Extract page contents with [html_node()] and [html_nodes()], or get the
#' * Extract page contents with [html_element()] and [html_elements()], or get the
#' complete HTML document with [read_html()].
#' * Inspect the HTTP response with [httr::cookies()], [httr::headers()],
#' and [httr::status_code()].
Expand All @@ -35,7 +35,7 @@
#' \donttest{
#' s %>%
#' follow_link(css = "p a") %>%
#' html_nodes("p")
#' html_elements("p")
#' }
html_session <- function(url, ...) {
session <- structure(
Expand Down Expand Up @@ -97,7 +97,7 @@ jump_to <- function(x, url, ...) {

#' @param i An integer to select the ith link or a string to match the
#' first link containing that text (case sensitive).
#' @inheritParams html_node
#' @inheritParams html_element
#' @export
#' @rdname html_session
follow_link <- function(x, i, css, xpath, ...) {
Expand All @@ -115,7 +115,7 @@ find_href <- function(x, i, css, xpath) {

if (!missing(i)) {
stopifnot(length(i) == 1)
a <- html_nodes(x, "a")
a <- html_elements(x, "a")

if (is.numeric(i)) {
out <- a[[i]]
Expand All @@ -131,7 +131,7 @@ find_href <- function(x, i, css, xpath) {
abort("`i` must a string or integer")
}
} else {
a <- html_nodes(x, css = css, xpath = xpath)
a <- html_elements(x, css = css, xpath = xpath)
if (length(a) == 0) {
abort("No links matched `css`/`xpath`")
}
Expand Down Expand Up @@ -334,13 +334,13 @@ html_table.rvest_session <- function(x,
}

#' @export
html_node.rvest_session <- function(x, css, xpath) {
html_node(read_html(x), css, xpath)
html_element.rvest_session <- function(x, css, xpath) {
html_element(read_html(x), css, xpath)
}

#' @export
html_nodes.rvest_session <- function(x, css, xpath) {
html_nodes(read_html(x), css, xpath)
html_elements.rvest_session <- function(x, css, xpath) {
html_elements(read_html(x), css, xpath)
}

# httr methods -----------------------------------------------------------------
Expand Down
6 changes: 3 additions & 3 deletions R/table.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
#' <tr><td>10</td><td>z</td></tr>
#' </table>")
#' sample1 %>%
#' html_node("table") %>%
#' html_element("table") %>%
#' html_table()
#'
#' # Values in merged cells will be duplicated
Expand All @@ -33,7 +33,7 @@
#' <tr><td>6</td><td colspan='2'>7</td></tr>
#' </table>")
#' sample2 %>%
#' html_node("table") %>%
#' html_element("table") %>%
#' html_table()
#'
#' # If a row is missing cells, they'll be filled with NAs
Expand All @@ -44,7 +44,7 @@
#' <tr><td>4</td></tr>
#' </table>")
#' sample3 %>%
#' html_node("table") %>%
#' html_element("table") %>%
#' html_table()
html_table <- function(x,
header = NA,
Expand Down
20 changes: 10 additions & 10 deletions R/text.R
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
#' Get text from nodes
#' Get text from elements
#'
#' @description
#' There are two ways to retrieve text from a node: `html_text()` and
#' There are two ways to retrieve text from an element: `html_text()` and
#' `html_text2()`. `html_text()` is a thin wrapper around [xml2::xml_text()]
#' which returns just the text nodes. `html_text2()` simulates how text looks
#' in a browser, using an approach inspired by the javascript
#' [innerText](https://developer.mozilla.org/en-US/docs/Web/API/HTMLElement/innerText)
#' function. Roughly speaking, it converts `<br />` to `"\n"`, adds blank lines
#' which returns just the raw underlying text. `html_text2()` simulates how
#' text looks in a browser, using an approach inspired by javascript's
#' [innerText()](https://developer.mozilla.org/en-US/docs/Web/API/HTMLElement/innerText).
#' Roughly speaking, it converts `<br />` to `"\n"`, adds blank lines
#' around `<p>` tags, and lightly formats tabular data.
#'
#' `html_text2()` is usually what you want, but it is much slower than
Expand All @@ -27,17 +27,17 @@
#'
#' # html_text() returns the raw underlying text, which includes whitespace
#' # that would be ignored by a browser, and ignores the <br>
#' html %>% html_node("p") %>% html_text() %>% writeLines()
#' html %>% html_element("p") %>% html_text() %>% writeLines()
#'
#' # html_text2() simulates what a browser would display. Non-significant
#' # whitespace is collapsed, and <br> is turned into a line break
#' html %>% html_node("p") %>% html_text2() %>% writeLines()
#' html %>% html_element("p") %>% html_text2() %>% writeLines()
#'
#' # By default, html_text2() also converts non-breaking spaces to regular
#' # spaces:
#' html <- minimal_html("<p>x&nbsp;y</p>")
#' x1 <- html %>% html_node("p") %>% html_text()
#' x2 <- html %>% html_node("p") %>% html_text2()
#' x1 <- html %>% html_element("p") %>% html_text()
#' x2 <- html %>% html_element("p") %>% html_text2()
#'
#' # When printed, non-breaking spaces look exactly like regular spaces
#' x1
Expand Down
Loading

0 comments on commit 6fe2c45

Please sign in to comment.