Added documentation for odt

trinker · Jun 19, 2020 · dea4e01 · dea4e01
1 parent b176aa5
commit dea4e01
Show file tree

Hide file tree

Showing 9 changed files with 346 additions and 188 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -13,6 +13,7 @@ export(read_doc)
 export(read_document)
 export(read_docx)
 export(read_html)
+export(read_odt)
 export(read_pdf)
 export(read_pptx)
 export(read_rtf)

diff --git a/NEWS b/NEWS
@@ -25,6 +25,8 @@ BUG FIXES
 
 NEW FEATURES
 
+* `read_odt` added to read in .odt files.
+
 MINOR FEATURES
 
 IMPROVEMENTS

diff --git a/R/read_document.R b/R/read_document.R
@@ -76,6 +76,7 @@ read_document <- function(file, skip = 0, remove.empty = TRUE, trim = TRUE,
     fun <- switch(filetype,
         pdf = {function(x, ...) {read_pdf(x, remove.empty = FALSE, trim = FALSE, ocr = ocr, ...)[["text"]]}},
         docx = {function(x, ...) {read_docx(x, remove.empty = FALSE, trim = FALSE, ...)}},
+        odt = {function(x, ...) {read_odt(x, remove.empty = FALSE, trim = FALSE, ...)}},        
         doc = {function(x, ...) {read_doc(x, remove.empty = FALSE, trim = FALSE, format=format, ...)}},
         rtf = {function(x, ...) {read_rtf(x, remove.empty = FALSE, trim = FALSE, ...)}},
         html = {function(x, ...) {read_html(x, remove.empty = FALSE, trim = FALSE, ...)}},

diff --git a/R/read_otf.R → R/read_odt.R b/R/read_otf.R → R/read_odt.R
@@ -1,21 +1,20 @@
-#' Read in .docx Content
+#' Read in .odt Content
 #'
-#' Read in the content from a .docx file.
+#' Read in the content from a .odt file.
 #'
-#' @param file The path to the .docx file.
+#' @param file The path to the .odt file.
 #' @param skip The number of lines to skip.
 #' @param remove.empty logical.  If `TRUE` empty elements in the vector are
 #' removed.
 #' @param trim logical.  If `TRUE` the leading/trailing white space is
 #' removed.
 #' @param ... ignored.
 #' @return Returns a character vector.
-#' @keywords docx
+#' @keywords odt
 #' @export
-#' @author Bryan Goodrich and Tyler Rinker <tyler.rinker@@gmail.com>.
 #' @examples
 #' \dontrun{
-#' url <- "https://github.com/trinker/textreadr/raw/master/inst/docs/Yasmine_Interview_Transcript.docx"
+#' url <- "https://github.com/trinker/textreadr/raw/master/inst/docs/Hello_World.odt"
 #' file <- download(url)
 #' (txt <- read_odt(file))
 #' }
@@ -43,7 +42,7 @@ read_odt <- function (file, skip = 0, remove.empty = TRUE, trim = TRUE, ...) {
     doc <- xml2::read_xml(xmlfile)
 
     ## extract the content
-    rm_na <- function(x) x[!is.na(x)]
+    #rm_na <- function(x) x[!is.na(x)]
 
     pvalues <- xml2::xml_text(xml2::xml_find_all(doc, "//text:p"))
 
@@ -59,4 +58,4 @@ read_odt <- function (file, skip = 0, remove.empty = TRUE, trim = TRUE, ...) {
 
 
 
-read_odt("C:/Users/trinker/Desktop/Hello World.odt")
+
diff --git a/R/read_transcript.R b/R/read_transcript.R
@@ -60,6 +60,8 @@
 #' (doc4 <- system.file("docs/trans4.xlsx", package = "textreadr"))
 #' (doc5 <- system.file("docs/trans5.xls", package = "textreadr"))
 #' (doc6 <- system.file("docs/trans6.doc", package = "textreadr"))
+#' ##(doc7 <- system.file("docs/trans7.rtf", package = "textreadr"))
+#' (doc8 <- system.file("docs/trans8.odt", package = "textreadr"))
 #'
 #' dat1 <- read_transcript(doc1)
 #' dat2 <- read_transcript(doc1, col.names = c("person", "dialogue"))
@@ -76,8 +78,8 @@
 #'
 #' ## MS doc format
 #' \dontrun{
-#' dat7 <- read_transcript(doc6) ## need to skip Researcher
-#' dat8 <- read_transcript(doc6, skip = 1)
+#' dat6b <- read_transcript(doc6) ## need to skip Researcher
+#' dat6c <- read_transcript(doc6, skip = 1)
 #' }
 #'
 #' ## rtf format
@@ -87,7 +89,10 @@
 #' )
 #' dat9 <- read_transcript(rtf_doc, skip = 1)
 #' }
-#'
+#' 
+#' ## odt format
+#' read_transcript(doc8)
+#' 
 #' ## text string input
 #' trans <- "sam: Computer is fun. Not too fun.
 #' greg: No it's not, it's dumb.
@@ -147,7 +152,7 @@ function(file, col.names = c("Person", "Dialogue"), text.var = NULL, merge.broke
     }
 
     if (is.null(sep)) {
-        if (y %in% c("docx", "doc", "txt", "text", 'pdf', 'rtf')) {
+        if (y %in% c("docx", "doc", "txt", "text", 'pdf', 'rtf', 'odt')) {
             sep <- ":"
         } else {
             sep <- ","
@@ -179,6 +184,14 @@ function(file, col.names = c("Person", "Dialogue"), text.var = NULL, merge.broke
                         paste(which(sep_hits), collapse=", "))
             }
         },
+        odt = {
+            x <- read.odt(file, skip = skip, sep = sep, max.person.nchar = max.person.nchar)
+            sep_hits <- grepl(sep, x[, 2])
+            if(any(sep_hits)) {
+                warning(sprintf("The following text contains the \"%s\" separator and may not have split correctly:\n", sep),
+                    paste(which(sep_hits), collapse=", "))
+                }
+            },       
         rtf = {
             x <- read.rtf(file, skip = skip, sep = sep, max.person.nchar = max.person.nchar, ...)
             sep_hits <- grepl(sep, x[, 2])
@@ -305,6 +318,45 @@ function(file, skip = 0, sep = ":", max.person.nchar = 20) {
 }
 
 
+
+## Read an .odt transcript into a two-column data.frame (speaker, dialogue).
+##
+## file             Path to the .odt file (a zip archive holding content.xml).
+## skip             Number of leading non-empty paragraphs to drop.
+## sep              Regular expression separating the speaker from the text.
+## max.person.nchar Warn when `sep` is only found after at least this many
+##                  characters from the start of a paragraph.
+##
+## Returns a data.frame with character columns X1 (speaker) and X2 (dialogue).
+## Paragraphs without a separator are attributed to the preceding speaker.
+read.odt <-
+function(file, skip = 0, sep = ":", max.person.nchar = 20) {
+
+    ## create temp dir
+    tmp <- tempfile()
+    if (!dir.create(tmp)) stop("Temporary directory could not be established.")
+
+    ## clean up (add = TRUE so any other registered exit handler is kept)
+    on.exit(unlink(tmp, recursive = TRUE), add = TRUE)
+
+    ## unzip odt: only content.xml is read, so only that member is extracted
+    xmlfile <- file.path(tmp, "content.xml")
+    utils::unzip(file, files = "content.xml", exdir = tmp)
+    if (!file.exists(xmlfile)) {
+        stop("Could not extract 'content.xml'; is this a valid .odt file?")
+    }
+
+    ## Import XML
+    doc <- xml2::read_xml(xmlfile)
+
+    ## extract the content (one element per text paragraph)
+    pvalues <- xml2::xml_text(xml2::xml_find_all(doc, "//text:p"))
+
+    pvalues <- pvalues[!grepl("^\\s*$", pvalues)]      # Remove empty lines
+    if (skip > 0) pvalues <- pvalues[-seq_len(skip)]   # Ignore these many lines
+    if (any(grepl(paste0("^.{", max.person.nchar, ",}", sep), pvalues))) {
+        warning(sprintf(paste0(
+            "I've detected the separator beyond %s characters from the line start.  Parsing may be incorrect...\n",
+            "  Consider manually searching the .odt for use of the separator in-text rather than to separate person/text."
+        ), max.person.nchar))
+    }
+
+    ## The pattern is anchored at the line start, so each paragraph has at
+    ## most one match; regexpr() therefore suffices and, unlike
+    ## sapply(gregexpr()), stays a plain vector when `pvalues` is empty.
+    speaker_regex <- paste0("^.*?", sep)
+    hits <- regexpr(speaker_regex, pvalues)
+    keys <- hits > 0
+    key_rows <- which(keys)
+
+    ## Every paragraph needs a preceding speaker to inherit from.
+    if (length(pvalues) > 0 && (length(key_rows) == 0 || key_rows[1] > 1)) {
+        stop(
+            "The first line read does not contain the separator; ",
+            "consider using `skip` to drop leading non-dialogue lines."
+        )
+    }
+
+    speaker <- regmatches(pvalues, hits)       # One entry per speaker line
+    pvalues <- sub(speaker_regex, "", pvalues) # Remove speaker from lines
+
+    ## Repeat each speaker over its own line plus any continuation lines
+    speaker <- rep(speaker, diff(c(key_rows, length(pvalues) + 1)))
+    speaker <- substr(speaker, 1, nchar(speaker) - nchar(sep)) # Remove ending separator
+
+    data.frame(X1 = trimws(speaker), X2 = trimws(pvalues), stringsAsFactors = FALSE)
+}
+
 read.rtf <-
 function(file, skip = 0, sep = ":", max.person.nchar = 20, ...) {