/
read_odt.R
61 lines (45 loc) · 1.52 KB
/
read_odt.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#' Read in .odt Content
#'
#' Read in the text content from a .odt (OpenDocument Text) file.  The
#' archive's \file{content.xml} is extracted to a temporary directory
#' (removed again on exit) and the text of every \code{text:p} node is
#' returned.
#'
#' @param file The path to the .odt file.  A path that ends in \code{.odt}
#' and starts with \code{http} or \code{ftp} is treated as a URL and is
#' fetched with \code{download} first.
#' @param skip The number of leading elements (paragraphs) to skip.  Skipping
#' is applied after empty elements have been removed (when
#' \code{remove.empty = TRUE}).
#' @param remove.empty logical.  If `TRUE` empty elements in the vector are
#' removed.
#' @param trim logical.  If `TRUE` the leading/trailing white space is
#' removed.
#' @param ... ignored.
#' @return Returns a character vector with one element per retained
#' paragraph; \code{""} when no paragraphs remain.
#' @keywords odt
#' @export
#' @examples
#' \dontrun{
#' url <- "https://github.com/trinker/textreadr/raw/master/inst/docs/Hello_World.odt"
#' file <- download(url)
#' (txt <- read_odt(file))
#' }
read_odt <- function (file, skip = 0, remove.empty = TRUE, trim = TRUE, ...) {

    ## fetch remote files first; `^(f|ht)tp` covers http, https and ftp
    filetype <- tools::file_ext(file)
    if (filetype %in% c("odt") && grepl("^(f|ht)tp", file)) {
        file <- download(file)
    }

    ## fail early with a clear message rather than an obscure XML error
    if (!file.exists(file)) {
        stop("The .odt file does not exist: ", file, call. = FALSE)
    }

    ## create temp dir
    tmp <- tempfile()
    if (!dir.create(tmp)) stop("Temporary directory could not be established.")

    ## clean up (add = TRUE so other exit handlers are never clobbered)
    on.exit(unlink(tmp, recursive = TRUE), add = TRUE)

    ## unzip the odt; only content.xml is needed, so extract nothing else
    xmlfile <- file.path(tmp, "content.xml")
    utils::unzip(file, files = "content.xml", exdir = tmp)
    if (!file.exists(xmlfile)) {
        stop(
            "Could not extract 'content.xml'; is this a valid .odt file? ",
            file,
            call. = FALSE
        )
    }

    ## read in the unzipped odt content
    doc <- xml2::read_xml(xmlfile)

    ## extract the text of every paragraph node
    pvalues <- xml2::xml_text(xml2::xml_find_all(doc, "//text:p"))

    ## formatting: drop blanks first so `skip` counts only retained elements
    if (isTRUE(remove.empty)) pvalues <- pvalues[!grepl("^\\s*$", pvalues)]
    if (skip > 0) pvalues <- pvalues[-seq(skip)]
    if (isTRUE(trim)) pvalues <- trimws(pvalues)

    ## always hand back at least one (empty) string
    if (length(pvalues) == 0) pvalues <- ""

    pvalues

}