Rename html_node(s) to html_element(s)

Fixes #298
tidyverse · Jan 6, 2021 · 6fe2c45 · 6fe2c45
1 parent 9825ff6
commit 6fe2c45
Show file tree

Hide file tree

Showing 34 changed files with 354 additions and 333 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -3,14 +3,14 @@
 S3method(cookies,rvest_session)
 S3method(format,rvest_field)
 S3method(headers,rvest_session)
+S3method(html_element,default)
+S3method(html_element,rvest_session)
+S3method(html_elements,default)
+S3method(html_elements,rvest_session)
 S3method(html_form,rvest_session)
 S3method(html_form,xml_document)
 S3method(html_form,xml_node)
 S3method(html_form,xml_nodeset)
-S3method(html_node,default)
-S3method(html_node,rvest_session)
-S3method(html_nodes,default)
-S3method(html_nodes,rvest_session)
 S3method(html_table,rvest_session)
 S3method(html_table,xml_document)
 S3method(html_table,xml_node)
@@ -33,6 +33,8 @@ export(guess_encoding)
 export(html_attr)
 export(html_attrs)
 export(html_children)
+export(html_element)
+export(html_elements)
 export(html_encoding_guess)
 export(html_form)
 export(html_form_set)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,9 @@
 # rvest (development version)
 
+* `html_node()` and `html_nodes()` have been superseded in favour of
+  `html_element()` and `html_elements()` since they (almost) always return
+  elements, not nodes (#298). 
+
 * `html_table()` gains `na.strings` argument to control what values are 
   converted to `NA` (#107).
 

diff --git a/R/encoding.R b/R/encoding.R
@@ -11,12 +11,12 @@
 #' # A file with bad encoding included in the package
 #' path <- system.file("html-ex", "bad-encoding.html", package = "rvest")
 #' x <- read_html(path)
-#' x %>% html_nodes("p") %>% html_text()
+#' x %>% html_elements("p") %>% html_text()
 #'
 #' html_encoding_guess(x)
 #' # Two valid encodings, only one of which is correct
-#' read_html(path, encoding = "ISO-8859-1") %>% html_nodes("p") %>% html_text()
-#' read_html(path, encoding = "ISO-8859-2") %>% html_nodes("p") %>% html_text()
+#' read_html(path, encoding = "ISO-8859-1") %>% html_elements("p") %>% html_text()
+#' read_html(path, encoding = "ISO-8859-2") %>% html_elements("p") %>% html_text()
 html_encoding_guess <- function(x) {
   check_installed("stringi")
 

diff --git a/R/form.R b/R/form.R
@@ -43,7 +43,7 @@ html_form.xml_node <- function(x) {
   method <- toupper(attr$method %||% "GET")
   enctype <- convert_enctype(attr$enctype)
 
-  nodes <- html_nodes(x, "input, select, textarea, button")
+  nodes <- html_elements(x, "input, select, textarea, button")
   fields <- lapply(nodes, function(x) {
     switch(xml2::xml_name(x),
       textarea = parse_textarea(x),
@@ -147,7 +147,7 @@ parse_input <- function(x) {
 
 parse_select <- function(x) {
   attr <- as.list(xml2::xml_attrs(x))
-  options <- parse_options(html_nodes(x, "option"))
+  options <- parse_options(html_elements(x, "option"))
 
   rvest_field(
     type = "select",

diff --git a/R/html.R b/R/html.R
@@ -1,4 +1,4 @@
-#' Get name and attributes from nodes
+#' Get name and attributes from elements
 #'
 #' `html_name()` gets the tag name, `html_attr()` gets a single attribute,
 #' and `html_attrs()` gets all attributes.
@@ -10,7 +10,7 @@
 #' @examples
 #' url <- "https://en.wikipedia.org/wiki/The_Lego_Movie"
 #' movie <- read_html(url)
-#' cast <- html_nodes(movie, "tr:nth-child(8) .plainlist a")
+#' cast <- html_elements(movie, "tr:nth-child(8) .plainlist a")
 #'
 #' html_name(cast)
 #' html_attrs(cast)
@@ -27,7 +27,7 @@ html_name <- function(x) {
 #' @rdname html_name
 #' @param name Name of attribute to retrieve.
 #' @param default A string used as a default value when the attribute does
-#'   not exist in every node.
+#'   not exist in every element.
 #' @export
 #' @importFrom xml2 xml_attr
 html_attr <- function(x, name, default = NA_character_) {
@@ -47,11 +47,11 @@ html_attrs <- function(x) {
 #' @inheritParams xml2::xml_text
 #' @examples
 #' html <- minimal_html("<ul><li>1<li>2<li>3</ul>")
-#' ul <- html_nodes(html, "ul")
+#' ul <- html_elements(html, "ul")
 #' html_children(ul)
 #'
 #' html <- minimal_html("<p>Hello <b>Hadley</b><i>!</i>")
-#' p <- html_nodes(html, "p")
+#' p <- html_elements(html, "p")
 #' html_children(p)
 #' @importFrom xml2 xml_children
 html_children <- function(x) {

diff --git a/R/rename.R b/R/rename.R
@@ -10,8 +10,11 @@
 #' * `set_values()` -> `html_form_set()`
 #' * `submit_form()` -> `session_submit()`
 #' * `xml_tag()` -> `html_name()`
-#' * `xml_node()` -> `html_node()`
-#' * `xml_nodes()` -> `html_nodes()`
+#' * `xml_node()` & `html_node()` -> `html_element()`
+#' * `xml_nodes()` & `html_nodes()` -> `html_elements()`
+#'
+#' (`html_node()` and `html_nodes()` are only superseded because they're
+#' so widely used.)
 #'
 #' @keywords internal
 #' @name rename
@@ -42,13 +45,25 @@ xml_tag <- function(x) {
 #' @export
 #' @rdname rename
 xml_node <- function(...) {
-  lifecycle::deprecate_warn("1.0.0", "xml_node()", "html_node()")
+  lifecycle::deprecate_warn("1.0.0", "xml_node()", "html_element()")
   html_node(...)
 }
 
 #' @export
 #' @rdname rename
 xml_nodes <- function(...) {
-  lifecycle::deprecate_warn("1.0.0", "xml_nodes()", "html_nodes()")
+  lifecycle::deprecate_warn("1.0.0", "xml_nodes()", "html_elements()")
   html_nodes(...)
 }
+
+#' @export
+#' @rdname rename
+html_nodes <- function(...) {
+  html_elements(...)
+}
+
+#' @export
+#' @rdname rename
+html_node <- function(...) {
+  html_element(...)
+}
diff --git a/R/selectors.R b/R/selectors.R
@@ -1,12 +1,9 @@
-#' Select nodes from an HTML document
+#' Select elements from an HTML document
 #'
-#' `html_node()` and `html_nodes()` find HTML tags (nodes) using CSS selectors
-#' or XPath expressions.
-#'
-#' CSS selectors are particularly useful in conjunction with
-#' <https://selectorgadget.com/>, which makes it very easy to discover the
-#' selector you need. If you haven't used CSS selectors before, I'd recommend
-#' starting with the the fun tutorial at <http://flukeout.github.io/>.
+#' `html_element()` and `html_elements()` find HTML elements using CSS selectors
+#' or XPath expressions. CSS selectors are particularly useful in conjunction
+#' with <https://selectorgadget.com/>, which makes it very easy to discover the
+#' selector you need.
 #'
 #' @section CSS selector support:
 #'
@@ -29,10 +26,11 @@
 #'   simple selector.
 #'
 #' @param x Either a document, a node set or a single node.
-#' @param css,xpath Nodes to select. Supply one of `css` or `xpath`
-#'   depending on whether you want to use a CSS or XPath 1.0 selector.
-#' @returns `html_node()` returns a nodeset the same length as the input.
-#'   `html_nodes()` flattens the output so there's no direct way to map
+#' @param css,xpath Elements to select. Supply one of `css` or `xpath`
+#'   depending on whether you want to use a CSS selector or XPath 1.0
+#'   expression.
+#' @returns `html_element()` returns a nodeset the same length as the input.
+#'   `html_elements()` flattens the output so there's no direct way to map
 #'   the output to the input.
 #' @export
 #' @examples
@@ -41,44 +39,46 @@
 #'   "https://www.boxofficemojo.com/movies/?id=ateam.htm"
 #' )
 #' ateam <- read_html(url)
-#' html_nodes(ateam, "center")
-#' html_nodes(ateam, "center font")
-#' html_nodes(ateam, "center font b")
+#' html_elements(ateam, "center")
+#' html_elements(ateam, "center font")
+#' html_elements(ateam, "center font b")
 #'
-#' # html_nodes() well suited to use with the pipe
-#' ateam %>% html_nodes("center") %>% html_nodes("td")
-#' ateam %>% html_nodes("center") %>% html_nodes("font")
+#' ateam %>% html_elements("center") %>% html_elements("td")
+#' ateam %>% html_elements("center") %>% html_elements("font")
 #'
-#' td <- ateam %>% html_nodes("center") %>% html_nodes("td")
+#' td <- ateam %>% html_elements("center") %>% html_elements("td")
 #' td
-#' # When applied to a list of nodes, html_nodes() returns all matching nodes
-#' # beneath any of the elements, flattening results into a new nodelist.
-#' td %>% html_nodes("font")
 #'
-#' # html_node() returns the first matching node. If there are no matching
-#' # nodes, it returns a "missing" node
-#' td %>% html_node("font")
+#' # When applied to a node set, html_elements() returns all matching elements
+#' # beneath any of the inputs, flattening results into a new node set.
+#' td %>% html_elements("font")
+#'
+#' # html_element() returns the first matching element. If there are no matching
+#' # elements, it returns a "missing" element
+#' td %>% html_element("font")
+#' # and html_text() and html_attr() will return NA
+#' td %>% html_element("font") %>% html_text()
 #'
 #' # To pick out an element or elements at specified positions, use [[ and [
-#' ateam %>% html_nodes("table") %>% .[[1]] %>% html_nodes("img")
-#' ateam %>% html_nodes("table") %>% .[1:2] %>% html_nodes("img")
-html_nodes <- function(x, css, xpath) {
-  UseMethod("html_nodes")
+#' ateam %>% html_elements("table") %>% .[[1]] %>% html_elements("img")
+#' ateam %>% html_elements("table") %>% .[1:2] %>% html_elements("img")
+html_element <- function(x, css, xpath) {
+  UseMethod("html_element")
 }
 
 #' @export
-html_nodes.default <- function(x, css, xpath) {
-  xml2::xml_find_all(x, make_selector(css, xpath))
+#' @rdname html_element
+html_elements <- function(x, css, xpath) {
+  UseMethod("html_elements")
 }
 
 #' @export
-#' @rdname html_nodes
-html_node <- function(x, css, xpath) {
-  UseMethod("html_node")
+html_elements.default <- function(x, css, xpath) {
+  xml2::xml_find_all(x, make_selector(css, xpath))
 }
 
 #' @export
-html_node.default <- function(x, css, xpath) {
+html_element.default <- function(x, css, xpath) {
   xml2::xml_find_first(x, make_selector(css, xpath))
 }
 

diff --git a/R/session.R b/R/session.R
@@ -10,7 +10,7 @@
 #' * Submit an [html_form] with `session_submit()`.
 #' * View the history with `session_history()` and navigate back and forward
 #'   with `back()` and `forward()`.
-#' * Extract page contents with [html_node()] and [html_nodes()], or get the
+#' * Extract page contents with [html_element()] and [html_elements()], or get the
 #'   complete HTML document with [read_html()].
 #' * Inspect the HTTP response with [httr::cookies()], [httr::headers()],
 #'   and [httr::status_code()].
@@ -35,7 +35,7 @@
 #' \donttest{
 #' s %>%
 #'   follow_link(css = "p a") %>%
-#'   html_nodes("p")
+#'   html_elements("p")
 #' }
 html_session <- function(url, ...) {
   session <-   structure(
@@ -97,7 +97,7 @@ jump_to <- function(x, url, ...) {
 
 #' @param i A integer to select the ith link or a string to match the
 #'  first link containing that text (case sensitive).
-#' @inheritParams html_node
+#' @inheritParams html_element
 #' @export
 #' @rdname html_session
 follow_link <- function(x, i, css, xpath, ...) {
@@ -115,7 +115,7 @@ find_href <- function(x, i, css, xpath) {
 
   if (!missing(i)) {
     stopifnot(length(i) == 1)
-    a <- html_nodes(x, "a")
+    a <- html_elements(x, "a")
 
     if (is.numeric(i)) {
       out <- a[[i]]
@@ -131,7 +131,7 @@ find_href <- function(x, i, css, xpath) {
       abort("`i` must a string or integer")
     }
   } else {
-    a <- html_nodes(x, css = css, xpath = xpath)
+    a <- html_elements(x, css = css, xpath = xpath)
     if (length(a) == 0) {
       abort("No links matched `css`/`xpath`")
     }
@@ -334,13 +334,13 @@ html_table.rvest_session <- function(x,
 }
 
 #' @export
-html_node.rvest_session <- function(x, css, xpath) {
-  html_node(read_html(x), css, xpath)
+html_element.rvest_session <- function(x, css, xpath) {
+  html_element(read_html(x), css, xpath)
 }
 
 #' @export
-html_nodes.rvest_session <- function(x, css, xpath) {
-  html_nodes(read_html(x), css, xpath)
+html_elements.rvest_session <- function(x, css, xpath) {
+  html_elements(read_html(x), css, xpath)
 }
 
 # httr methods -----------------------------------------------------------------

diff --git a/R/table.R b/R/table.R
@@ -22,7 +22,7 @@
 #'   <tr><td>10</td><td>z</td></tr>
 #' </table>")
 #' sample1 %>%
-#'   html_node("table") %>%
+#'   html_element("table") %>%
 #'   html_table()
 #'
 #' # Values in merged cells will be duplicated
@@ -33,7 +33,7 @@
 #'   <tr><td>6</td><td colspan='2'>7</td></tr>
 #' </table>")
 #' sample2 %>%
-#'   html_node("table") %>%
+#'   html_element("table") %>%
 #'   html_table()
 #'
 #' # If a row is missing cells, they'll be filled with NAs
@@ -44,7 +44,7 @@
 #'   <tr><td>4</td></tr>
 #' </table>")
 #' sample3 %>%
-#'   html_node("table") %>%
+#'   html_element("table") %>%
 #'   html_table()
 html_table <- function(x,
                        header = NA,

diff --git a/R/text.R b/R/text.R
@@ -1,12 +1,12 @@
-#' Get text from nodes
+#' Get text from elements
 #'
 #' @description
-#' There are two ways to retrieve text from a node: `html_text()` and
+#' There are two ways to retrieve text from an element: `html_text()` and
 #' `html_text2()`. `html_text()` is a thin wrapper around [xml2::xml_text()]
-#' which returns just the text nodes. `html_text2()` simulates how text looks
-#' in a browser, using an approach inspired by the javascript
-#' [innerText](https://developer.mozilla.org/en-US/docs/Web/API/HTMLElement/innerText)
-#' function. Roughly speaking, it converts `<br />` to `"\n"`, adds blank lines
+#' which returns just the raw underlying text. `html_text2()` simulates how
+#' text looks in a browser, using an approach inspired by javascript's
+#' [innerText()](https://developer.mozilla.org/en-US/docs/Web/API/HTMLElement/innerText).
+#' Roughly speaking, it converts `<br />` to `"\n"`, adds blank lines
 #' around `<p>` tags, and lightly formats tabular data.
 #'
 #' `html_text2()` is usually what you want, but it is much slower than
@@ -27,17 +27,17 @@
 #'
 #' # html_text() returns the raw underlying text, which includes whitespace
 #' # that would be ignored by a browser, and ignores the <br>
-#' html %>% html_node("p") %>% html_text() %>% writeLines()
+#' html %>% html_element("p") %>% html_text() %>% writeLines()
 #'
 #' # html_text2() simulates what a browser would display. Non-significant
 #' # whitespace is collapsed, and <br> is turned into a line break
-#' html %>% html_node("p") %>% html_text2() %>% writeLines()
+#' html %>% html_element("p") %>% html_text2() %>% writeLines()
 #'
 #' # By default, html_text2() also converts non-breaking spaces to regular
 #' # spaces:
 #' html <- minimal_html("<p>x&nbsp;y</p>")
-#' x1 <- html %>% html_node("p") %>% html_text()
-#' x2 <- html %>% html_node("p") %>% html_text2()
+#' x1 <- html %>% html_element("p") %>% html_text()
+#' x2 <- html %>% html_element("p") %>% html_text2()
 #'
 #' # When printed, non-breaking spaces look exactly like regular spaces
 #' x1