Skip to content

Commit

Permalink
Added documentation for odt
Browse files Browse the repository at this point in the history
  • Loading branch information
trinker committed Jun 19, 2020
1 parent b176aa5 commit dea4e01
Show file tree
Hide file tree
Showing 9 changed files with 346 additions and 188 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Expand Up @@ -13,6 +13,7 @@ export(read_doc)
export(read_document)
export(read_docx)
export(read_html)
export(read_odt)
export(read_pdf)
export(read_pptx)
export(read_rtf)
Expand Down
2 changes: 2 additions & 0 deletions NEWS
Expand Up @@ -25,6 +25,8 @@ BUG FIXES

NEW FEATURES

* `read_odt` added to read in .odt files.

MINOR FEATURES

IMPROVEMENTS
Expand Down
1 change: 1 addition & 0 deletions R/read_document.R
Expand Up @@ -76,6 +76,7 @@ read_document <- function(file, skip = 0, remove.empty = TRUE, trim = TRUE,
fun <- switch(filetype,
pdf = {function(x, ...) {read_pdf(x, remove.empty = FALSE, trim = FALSE, ocr = ocr, ...)[["text"]]}},
docx = {function(x, ...) {read_docx(x, remove.empty = FALSE, trim = FALSE, ...)}},
odt = {function(x, ...) {read_odt(x, remove.empty = FALSE, trim = FALSE, ...)}},
doc = {function(x, ...) {read_doc(x, remove.empty = FALSE, trim = FALSE, format=format, ...)}},
rtf = {function(x, ...) {read_rtf(x, remove.empty = FALSE, trim = FALSE, ...)}},
html = {function(x, ...) {read_html(x, remove.empty = FALSE, trim = FALSE, ...)}},
Expand Down
15 changes: 7 additions & 8 deletions R/read_otf.R → R/read_odt.R
@@ -1,21 +1,20 @@
#' Read in .docx Content
#' Read in .odt Content
#'
#' Read in the content from a .docx file.
#' Read in the content from a .odt file.
#'
#' @param file The path to the .docx file.
#' @param file The path to the .odt file.
#' @param skip The number of lines to skip.
#' @param remove.empty logical. If `TRUE` empty elements in the vector are
#' removed.
#' @param trim logical. If `TRUE` the leading/trailing white space is
#' removed.
#' @param ... ignored.
#' @return Returns a character vector.
#' @keywords docx
#' @keywords odt
#' @export
#' @author Bryan Goodrich and Tyler Rinker <tyler.rinker@@gmail.com>.
#' @examples
#' \dontrun{
#' url <- "https://github.com/trinker/textreadr/raw/master/inst/docs/Yasmine_Interview_Transcript.docx"
#' url <- "https://github.com/trinker/textreadr/raw/master/inst/docs/Hello_World.odt"
#' file <- download(url)
#' (txt <- read_odt(file))
#' }
Expand Down Expand Up @@ -43,7 +42,7 @@ read_odt <- function (file, skip = 0, remove.empty = TRUE, trim = TRUE, ...) {
doc <- xml2::read_xml(xmlfile)

## extract the content
rm_na <- function(x) x[!is.na(x)]
#rm_na <- function(x) x[!is.na(x)]

pvalues <- xml2::xml_text(xml2::xml_find_all(doc, "//text:p"))

Expand All @@ -59,4 +58,4 @@ read_odt <- function (file, skip = 0, remove.empty = TRUE, trim = TRUE, ...) {



## Ad-hoc manual check against a file on the author's machine.  Kept commented
## out: a top-level call in a package R/ file is evaluated whenever the file is
## sourced or the package is built, and this path exists on one machine only.
# read_odt("C:/Users/trinker/Desktop/Hello World.odt")

60 changes: 56 additions & 4 deletions R/read_transcript.R
Expand Up @@ -60,6 +60,8 @@
#' (doc4 <- system.file("docs/trans4.xlsx", package = "textreadr"))
#' (doc5 <- system.file("docs/trans5.xls", package = "textreadr"))
#' (doc6 <- system.file("docs/trans6.doc", package = "textreadr"))
#' ##(doc7 <- system.file("docs/trans7.rtf", package = "textreadr"))
#' (doc8 <- system.file("docs/trans8.odt", package = "textreadr"))
#'
#' dat1 <- read_transcript(doc1)
#' dat2 <- read_transcript(doc1, col.names = c("person", "dialogue"))
Expand All @@ -76,8 +78,8 @@
#'
#' ## MS doc format
#' \dontrun{
#' dat7 <- read_transcript(doc6) ## need to skip Researcher
#' dat8 <- read_transcript(doc6, skip = 1)
#' dat6b <- read_transcript(doc6) ## need to skip Researcher
#' dat6c <- read_transcript(doc6, skip = 1)
#' }
#'
#' ## rtf format
Expand All @@ -87,7 +89,10 @@
#' )
#' dat9 <- read_transcript(rtf_doc, skip = 1)
#' }
#'
#'
#' ## odt format
#' read_transcript(doc8)
#'
#' ## text string input
#' trans <- "sam: Computer is fun. Not too fun.
#' greg: No it's not, it's dumb.
Expand Down Expand Up @@ -147,7 +152,7 @@ function(file, col.names = c("Person", "Dialogue"), text.var = NULL, merge.broke
}

if (is.null(sep)) {
if (y %in% c("docx", "doc", "txt", "text", 'pdf', 'rtf')) {
if (y %in% c("docx", "doc", "txt", "text", 'pdf', 'rtf', 'odt')) {
sep <- ":"
} else {
sep <- ","
Expand Down Expand Up @@ -179,6 +184,14 @@ function(file, col.names = c("Person", "Dialogue"), text.var = NULL, merge.broke
paste(which(sep_hits), collapse=", "))
}
},
odt = {
x <- read.odt(file, skip = skip, sep = sep, max.person.nchar = max.person.nchar)
sep_hits <- grepl(sep, x[, 2])
if(any(sep_hits)) {
warning(sprintf("The following text contains the \"%s\" separator and may not have split correctly:\n", sep),
paste(which(sep_hits), collapse=", "))
}
},
rtf = {
x <- read.rtf(file, skip = skip, sep = sep, max.person.nchar = max.person.nchar, ...)
sep_hits <- grepl(sep, x[, 2])
Expand Down Expand Up @@ -305,6 +318,45 @@ function(file, skip = 0, sep = ":", max.person.nchar = 20) {
}



## Read a transcript stored in an .odt file and split it into speaker/dialogue.
##
## Arguments:
##   file             Path to the .odt file (a zip archive holding content.xml).
##   skip             Number of leading non-empty paragraphs to drop.
##   sep              Separator between speaker and dialogue.  It is pasted
##                    into regular expressions unescaped.
##   max.person.nchar A warning is raised when `sep` only occurs after at least
##                    this many characters from the start of a paragraph.
##
## Returns a data.frame with two character columns: X1 (speaker) and
## X2 (dialogue), one row per non-empty paragraph kept.
read.odt <-
function(file, skip = 0, sep = ":", max.person.nchar = 20) {

    ## create temp dir
    tmp <- tempfile()
    if (!dir.create(tmp)) stop("Temporary directory could not be established.")

    ## clean up (add = TRUE so any other exit handler is preserved)
    on.exit(unlink(tmp, recursive = TRUE), add = TRUE)

    ## unzip odt; only content.xml is read, so only that member is extracted
    xmlfile <- file.path(tmp, "content.xml")
    utils::unzip(file, files = "content.xml", exdir = tmp)
    if (!file.exists(xmlfile)) {
        stop("Could not extract 'content.xml' from: ", file)
    }

    ## Import XML
    doc <- xml2::read_xml(xmlfile)

    ## extract the content: one element per text:p node
    pvalues <- xml2::xml_text(xml2::xml_find_all(doc, "//text:p"))

    pvalues <- pvalues[!grepl("^\\s*$", pvalues)]  # Remove empty lines
    if (skip > 0) pvalues <- pvalues[-seq(skip)]   # Ignore these many lines

    ## nothing left to parse: return an empty transcript of the usual shape
    if (length(pvalues) == 0L) {
        return(data.frame(X1 = character(0), X2 = character(0), stringsAsFactors = FALSE))
    }

    if (any(grepl(paste0("^.{", max.person.nchar, ",}", sep), pvalues))) {
        warning(sprintf(paste0(
            "I've detected the separator beyond %s characters from the line start. Parsing may be incorrect...\n",
            " Consider manually searching the .odt for use of the separator in-text rather than to separate person/text."
        ), max.person.nchar))
    }

    ## shortest prefix of a paragraph that ends in the separator
    speaker_pattern <- paste0("^.*?", sep)

    ## the pattern is anchored, so each paragraph has at most one match;
    ## regexpr() yields -1 for paragraphs that do not start a new speaker turn
    hits <- regexpr(speaker_pattern, pvalues)
    keys <- which(hits > 0)
    speaker <- regmatches(pvalues, hits)           # one entry per matching paragraph
    pvalues <- sub(speaker_pattern, "", pvalues)   # Remove speaker from lines

    ## carry each speaker forward over the paragraphs up to the next speaker
    ## NOTE(review): paragraphs before the first speaker get no entry here, so
    ## the two columns can differ in length; callers appear to rely on `skip`
    ## to drop such leading lines -- confirm.
    speaker <- rep(speaker, diff(c(keys, length(pvalues) + 1)))
    speaker <- substr(speaker, 1, nchar(speaker) - nchar(sep))  # Remove ending separator

    data.frame(X1 = trimws(speaker), X2 = trimws(pvalues), stringsAsFactors = FALSE)
}

read.rtf <-
function(file, skip = 0, sep = ":", max.person.nchar = 20, ...) {

Expand Down

0 comments on commit dea4e01

Please sign in to comment.