diff --git a/NAMESPACE b/NAMESPACE index db80170..e06eb0f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,6 +13,7 @@ export(read_doc) export(read_document) export(read_docx) export(read_html) +export(read_odt) export(read_pdf) export(read_pptx) export(read_rtf) diff --git a/NEWS b/NEWS index 71ad9b6..8f90e31 100644 --- a/NEWS +++ b/NEWS @@ -25,6 +25,8 @@ BUG FIXES NEW FEATURES +* `read_odt` added to read in .odt files. + MINOR FEATURES IMPROVEMENTS diff --git a/R/read_document.R b/R/read_document.R index ad5ef4b..b9269e8 100644 --- a/R/read_document.R +++ b/R/read_document.R @@ -76,6 +76,7 @@ read_document <- function(file, skip = 0, remove.empty = TRUE, trim = TRUE, fun <- switch(filetype, pdf = {function(x, ...) {read_pdf(x, remove.empty = FALSE, trim = FALSE, ocr = ocr, ...)[["text"]]}}, docx = {function(x, ...) {read_docx(x, remove.empty = FALSE, trim = FALSE, ...)}}, + odt = {function(x, ...) {read_odt(x, remove.empty = FALSE, trim = FALSE, ...)}}, doc = {function(x, ...) {read_doc(x, remove.empty = FALSE, trim = FALSE, format=format, ...)}}, rtf = {function(x, ...) {read_rtf(x, remove.empty = FALSE, trim = FALSE, ...)}}, html = {function(x, ...) {read_html(x, remove.empty = FALSE, trim = FALSE, ...)}}, diff --git a/R/read_otf.R b/R/read_odt.R similarity index 80% rename from R/read_otf.R rename to R/read_odt.R index d894f10..f885f05 100644 --- a/R/read_otf.R +++ b/R/read_odt.R @@ -1,8 +1,8 @@ -#' Read in .docx Content +#' Read in .odt Content #' -#' Read in the content from a .docx file. +#' Read in the content from a .odt file. #' -#' @param file The path to the .docx file. +#' @param file The path to the .odt file. #' @param skip The number of lines to skip. #' @param remove.empty logical. If `TRUE` empty elements in the vector are #' removed. @@ -10,12 +10,11 @@ #' removed. #' @param ... ignored. #' @return Returns a character vector. -#' @keywords docx +#' @keywords odt #' @export -#' @author Bryan Goodrich and Tyler Rinker . 
#' @examples #' \dontrun{ -#' url <- "https://github.com/trinker/textreadr/raw/master/inst/docs/Yasmine_Interview_Transcript.docx" +#' url <- "https://github.com/trinker/textreadr/raw/master/inst/docs/Hello_World.odt" #' file <- download(url) #' (txt <- read_odt(file)) #' } @@ -43,7 +42,7 @@ read_odt <- function (file, skip = 0, remove.empty = TRUE, trim = TRUE, ...) { doc <- xml2::read_xml(xmlfile) ## extract the content - rm_na <- function(x) x[!is.na(x)] + #rm_na <- function(x) x[!is.na(x)] pvalues <- xml2::xml_text(xml2::xml_find_all(doc, "//text:p")) @@ -59,4 +58,4 @@ read_odt <- function (file, skip = 0, remove.empty = TRUE, trim = TRUE, ...) { -read_odt("C:/Users/trinker/Desktop/Hello World.odt") + diff --git a/R/read_transcript.R b/R/read_transcript.R index 913efcb..4f6dd9e 100644 --- a/R/read_transcript.R +++ b/R/read_transcript.R @@ -60,6 +60,8 @@ #' (doc4 <- system.file("docs/trans4.xlsx", package = "textreadr")) #' (doc5 <- system.file("docs/trans5.xls", package = "textreadr")) #' (doc6 <- system.file("docs/trans6.doc", package = "textreadr")) +#' ##(doc7 <- system.file("docs/trans7.rtf", package = "textreadr")) +#' (doc8 <- system.file("docs/trans8.odt", package = "textreadr")) #' #' dat1 <- read_transcript(doc1) #' dat2 <- read_transcript(doc1, col.names = c("person", "dialogue")) @@ -76,8 +78,8 @@ #' #' ## MS doc format #' \dontrun{ -#' dat7 <- read_transcript(doc6) ## need to skip Researcher -#' dat8 <- read_transcript(doc6, skip = 1) +#' dat6b <- read_transcript(doc6) ## need to skip Researcher +#' dat6c <- read_transcript(doc6, skip = 1) #' } #' #' ## rtf format @@ -87,7 +89,10 @@ #' ) #' dat9 <- read_transcript(rtf_doc, skip = 1) #' } -#' +#' +#' ## odt format +#' read_transcript(doc8) +#' #' ## text string input #' trans <- "sam: Computer is fun. Not too fun. #' greg: No it's not, it's dumb. 
@@ -147,7 +152,7 @@ function(file, col.names = c("Person", "Dialogue"), text.var = NULL, merge.broke } if (is.null(sep)) { - if (y %in% c("docx", "doc", "txt", "text", 'pdf', 'rtf')) { + if (y %in% c("docx", "doc", "txt", "text", 'pdf', 'rtf', 'odt')) { sep <- ":" } else { sep <- "," @@ -179,6 +184,14 @@ function(file, col.names = c("Person", "Dialogue"), text.var = NULL, merge.broke paste(which(sep_hits), collapse=", ")) } }, + odt = { + x <- read.odt(file, skip = skip, sep = sep, max.person.nchar = max.person.nchar) + sep_hits <- grepl(sep, x[, 2]) + if(any(sep_hits)) { + warning(sprintf("The following text contains the \"%s\" separator and may not have split correctly:\n", sep), + paste(which(sep_hits), collapse=", ")) + } + }, rtf = { x <- read.rtf(file, skip = skip, sep = sep, max.person.nchar = max.person.nchar, ...) sep_hits <- grepl(sep, x[, 2]) @@ -305,6 +318,45 @@ function(file, skip = 0, sep = ":", max.person.nchar = 20) { } + +read.odt <- +function(file, skip = 0, sep = ":", max.person.nchar = 20) { + ## Parse an .odt transcript into a data.frame: X1 = speaker, X2 = dialogue + ## create temp dir + tmp <- tempfile() + if (!dir.create(tmp)) stop("Temporary directory could not be established.") + + ## clean up + on.exit(unlink(tmp, recursive=TRUE)) + + ## unzip the .odt archive; the text is read from its content.xml + xmlfile <- file.path(tmp, "content.xml") + utils::unzip(file, exdir = tmp) + + ## Import XML + doc <- xml2::read_xml(xmlfile) + + ## extract the text of every paragraph (text:p) node + pvalues <- xml2::xml_text(xml2::xml_find_all(doc, "//text:p")) + + pvalues <- pvalues[!grepl("^\\s*$", pvalues)] # Remove empty lines + if (skip > 0) pvalues <- pvalues[-seq(skip)] # Ignore these many lines + if (any(grepl(paste0("^.{", max.person.nchar, ",}", sep), pvalues))) { + warning(sprintf(paste0( + "I've detected the separator beyond %s characters from the line start. Parsing may be incorrect...\n", + " Consider manually searching the .odt for use of the separator in-text rather than to separate person/text." 
+ ), max.person.nchar)) + } + keys <- sapply(gregexpr(paste0("^.*?", sep), pvalues), function(x) x > 0) # TRUE where a line opens with a speaker label + speaker <- regmatches(pvalues, gregexpr(paste0("^.*?", sep), pvalues)) + pvalues <- gsub(paste0("^.*?", sep), "", pvalues) # Remove speaker from lines + speaker <- rep(speaker[which(keys)], diff(c(which(keys), length(speaker)+1))) # Carry each speaker across its continuation lines + speaker <- unlist(speaker) # Make sure it's a vector + speaker <- substr(speaker, 1, nchar(speaker)-nchar(sep)) # Remove trailing separator + transcript <- data.frame(X1 = trimws(speaker), X2 = trimws(pvalues), stringsAsFactors = FALSE) + return(transcript) +} + read.rtf <- function(file, skip = 0, sep = ":", max.person.nchar = 20, ...) { diff --git a/README.Rmd b/README.Rmd index 30da766..0073bba 100644 --- a/README.Rmd +++ b/README.Rmd @@ -50,7 +50,8 @@ The main functions, task category, & descriptions are summarized in the table be | `read_document` | reading | Generic text reader for .doc, .docx, .rtf, .txt, .pdf | | `read_html` | reading | Read .html | | `read_pdf` | reading | Read .pdf | -| `read_dir` | reading | Read and format multiple .doc, .docx, .rtf, .txt, .pdf files | +| `read_odt` | reading | Read .odt | +| `read_dir` | reading | Read and format multiple .doc, .docx, .rtf, .txt, .pdf, .pptx, .odt files | | `read_dir_transcript` | reading | Read and format multiple transcript files | | `download` | downloading | Download documents | | `peek` | viewing | Truncated viewing of `data.frame`s | @@ -96,7 +97,8 @@ pdf_doc <- system.file("docs/rl10075oralhistoryst002.pdf", package = "textreadr" html_doc <- system.file('docs/textreadr_creed.html', package = "textreadr") txt_doc <- system.file('docs/textreadr_creed.txt', package = "textreadr") pptx_doc <- system.file('docs/Hello_World.pptx', package = "textreadr") - +odt_doc <- system.file('docs/Hello_World.odt', package = "textreadr") + rtf_doc <- download( 'https://raw.githubusercontent.com/trinker/textreadr/master/inst/docs/trans7.rtf' ) @@ -134,7 +136,7 @@ system.file("docs", 
package = "textreadr") %>% We can open files as well: -```{r} +```{r, eval = FALSE} html_doc %>% browse() ``` @@ -142,18 +144,23 @@ html_doc %>% ## Generic Document Reading -The `read_document` is a generic wrapper for `read_docx`, `read_doc`, `read_html`, and `read_pdf` that detects the file extension and chooses the correct reader. For most tasks that require reading a .docx, .doc, .html, .pdf, or .txt file this is the go-to function to get the job done. Below I demonstrate reading each of these five file formats with `read_document`. +The `read_document` is a generic wrapper for `read_docx`, `read_doc`, `read_html`, `read_odt`, `read_pdf`, `read_rtf`, and `read_pptx` that detects the file extension and chooses the correct reader. For most tasks that require reading a .docx, .doc, .html, .odt, .pdf, .pptx, .rtf, or .txt file this is the go-to function to get the job done. Below I demonstrate reading each of these eight file formats with `read_document`. ```{r} + +doc_doc %>% + read_document() %>% + head(3) + docx_doc %>% read_document() %>% head(3) - -doc_doc %>% + +html_doc %>% read_document() %>% head(3) -rtf_doc %>% +odt_doc %>% read_document() %>% head(3) @@ -161,7 +168,11 @@ pdf_doc %>% read_document() %>% head(3) -html_doc %>% +pptx_doc %>% + read_document() %>% + head(3) + +rtf_doc %>% read_document() %>% head(3) @@ -169,10 +180,6 @@ txt_doc %>% read_document() %>% paste(collapse = "\n") %>% cat() - -pptx_doc %>% - read_document() %>% - head(3) ``` @@ -214,7 +221,25 @@ system.file("docs/Maas2011/pos", package = "textreadr") %>% ``` -## Read .docx +## Basic Readers + +### Read .doc + +A .doc file is a bit trickier to read in than .docx but is made easy by the **antiword** package which wraps the [Antiword](http://www.winfield.demon.nl) program in an OS independent way. + +```{r} +doc_doc %>% + read_doc() %>% + head() +``` + +```{r} +doc_doc %>% + read_doc(15) %>% + head(7) +``` + +### Read .docx A .docx file is nothing but a fancy container. 
It can be parsed via XML. The `read_docx` function allows the user to read in a .docx file as plain text. Elements are essentially the p tags (explicitly `//w:t` tags collapsed with `//w:p` tags) in the markup. @@ -230,34 +255,27 @@ docx_doc %>% head(3) ``` -## Read .doc -A .doc file is a bit trickier to read in than .docx but is made easy by the **antiword** package which wraps the [Antiword](http://www.winfield.demon.nl) program in an OS independent way. +### Read .html -```{r} -doc_doc %>% - read_doc() %>% - head() -``` +Often a researcher only wishes to grab the text from the body of .html files. The `read_html` function does exactly this task. For finer control over .html scraping the user may investigate the **xml2** & **rvest** packages for parsing .html and .xml files. Here I read in HTML with `read_html`. ```{r} -doc_doc %>% - read_doc(15) %>% - head(7) +html_doc %>% + read_html() ``` +### Read .odt -## Read .rtf +Open Document Texts (.odt) are rather similar to .docx files in how they behave. The `read_odt` function reads them in in a similar way. -Rich text format (.rtf) is a plain text document with markup similar to latex. The **striprtf** package provides the backend for `read_rtf`. ```{r} -rtf_doc %>% - read_rtf() +odt_doc %>% + read_odt() ``` - -## Read .pdf +### Read .pdf Like .docx a .pdf file is simply a container. Reading PDF's is made easier with a number of command line tools. A few methods of PDF reading have been incorporated into R. Here I wrap **pdftools** `pdf_text` to produce `read_pdf`, a function with sensible defaults that is designed to read PDFs into R for as many folks as possible right out of the box. @@ -268,7 +286,7 @@ pdf_doc %>% read_pdf() ``` -### Image Based .pdf: OCR +#### Image Based .pdf: OCR Image based .pdfs require optical character recognition (OCR) in order for the images to be converted to text. 
The `ocr` argument of `read_pdf` allows the user to read in image based .pdf files and allow the [**tesseract**](https://CRAN.R-project.org/package=tesseract) package do the heavy lifting in the backend. You can look at the .pdf we'll be using by running: @@ -319,7 +337,7 @@ pdf_doc_img %>% ## .. ... ... ... ``` -## Read .pptx +### Read .pptx Like the .docx, a .pptx file is also nothing but a fancy container. Likewise, it can be parsed via XML. The `read_pptx` function allows the user to read in a .pptx file as a data.frame with plain text that tracks slide id numbers. @@ -328,16 +346,17 @@ pptx_doc %>% read_pptx() ``` +### Read .rtf -## Read .html - -Often a researcher only wishes to grab the text from the body of .html files. The `read_html` function does exactly this task. For finer control over .html scraping the user may investigate the **xml2** & **rvest** packages for parsing .html and .xml files. Here I read in HTML with `read_html`. +Rich text format (.rtf) is a plain text document with markup similar to latex. The **striprtf** package provides the backend for `read_rtf`. ```{r} -html_doc %>% - read_html() +rtf_doc %>% + read_rtf() ``` + + ## Read Transcripts Many researchers store their dialogue data (including interviews and observations) as a .docx or .xlsx file. Typically the data is a two column format with the person in the first column and the text in the second separated by some sort of separator (often a colon). The `read_transcript` wraps up many of these assumptions into a reader that will extract the data as a data frame with a person and text column. The `skip` argument is very important for correct parsing. @@ -348,6 +367,14 @@ Here I read in and parse the different formats `read_transcript` handles. 
These base_name(trans_docs) ``` + +### doc + +```{r} +read_transcript(trans_docs[6], skip = 1) +``` + + ### docx Simple ```{r} @@ -373,29 +400,29 @@ read_transcript(trans_docs[3], sep = "-", skip = 1) ``` -### xls and xlsx - +### odt ```{r} -read_transcript(trans_docs[4]) -read_transcript(trans_docs[5]) +read_transcript(trans_docs[8]) ``` +### rtf -### doc - ```{r} -read_transcript(trans_docs[6], skip = 1) +read_transcript(rtf_doc, skip = 1) ``` -### rtf +### xls and xlsx + ```{r} -read_transcript(rtf_doc, skip = 1) +read_transcript(trans_docs[4]) +read_transcript(trans_docs[5]) ``` + ### Reading Text Like `read.table`, `read_transcript` also has a `text` argument which is useful for demoing code. diff --git a/README.md b/README.md index 6f98b71..cf0bc62 100644 --- a/README.md +++ b/README.md @@ -32,20 +32,22 @@ Table of Contents - [Browse](#browse) - [Generic Document Reading](#generic-document-reading) - [Read Directory Contents](#read-directory-contents) - - [Read .docx](#read-docx) - - [Read .doc](#read-doc) - - [Read .rtf](#read-rtf) - - [Read .pdf](#read-pdf) - - [Image Based .pdf: OCR](#image-based-pdf-ocr) - - [Read .pptx](#read-pptx) - - [Read .html](#read-html) + - [Basic Readers](#basic-readers) + - [Read .doc](#read-doc) + - [Read .docx](#read-docx) + - [Read .html](#read-html) + - [Read .odt](#read-odt) + - [Read .pdf](#read-pdf) + - [Read .pptx](#read-pptx) + - [Read .rtf](#read-rtf) - [Read Transcripts](#read-transcripts) + - [doc](#doc) - [docx Simple](#docx-simple) - [docx With Skip](#docx-with-skip) - [docx With Dash Separator](#docx-with-dash-separator) - - [xls and xlsx](#xls-and-xlsx) - - [doc](#doc) + - [odt](#odt) - [rtf](#rtf) + - [xls and xlsx](#xls-and-xlsx) - [Reading Text](#reading-text) - [Authentic Interview](#authentic-interview) - [Pairing textreadr](#pairing-textreadr) @@ -153,21 +155,26 @@ table below: Read .pdf -read_dir +read_odt reading -Read and format multiple .doc, .docx, .rtf, .txt, .pdf files +Read .odt +read_dir +reading 
+Read and format multiple .doc, .docx, .rtf, .txt, .pdf, .pptx, .odt files + + read_dir_transcript reading Read and format multiple transcript files - + download downloading Download documents - + peek viewing Truncated viewing of data.frames @@ -220,7 +227,8 @@ Load the Packages/Data html_doc <- system.file('docs/textreadr_creed.html', package = "textreadr") txt_doc <- system.file('docs/textreadr_creed.txt', package = "textreadr") pptx_doc <- system.file('docs/Hello_World.pptx', package = "textreadr") - + odt_doc <- system.file('docs/Hello_World.odt', package = "textreadr") + rtf_doc <- download( 'https://raw.githubusercontent.com/trinker/textreadr/master/inst/docs/trans7.rtf' ) @@ -247,7 +255,7 @@ Here I download a .docx file of presidential debated from 2012. read_docx() %>% head(3) - ## pres.deb1.docx read into C:\Users\trinker\AppData\Local\Temp\RtmpkZ09wS + ## pres.deb1.docx read into C:\Users\trinker\AppData\Local\Temp\RtmpsLX1ol ## [1] "LEHRER: We'll talk about -- specifically about health care in a moment. But what -- do you support the voucher system, Governor?" ## [2] "ROMNEY: What I support is no change for current retirees and near-retirees to Medicare. And the president supports taking $716 billion out of that program." @@ -271,29 +279,38 @@ Generic Document Reading ------------------------ The `read_document` is a generic wrapper for `read_docx`, `read_doc`, -`read_html`, and `read_pdf` that detects the file extension and chooses -the correct reader. For most tasks that require reading a .docx, .doc, -.html, .pdf, or .txt file this is the go-to function to get the job -done. Below I demonstrate reading each of these five file formats with +`read_html`, `read_odt`, `read_pdf`, `read_rtf`, and `read_pptx` that +detects the file extension and chooses the correct reader. For most +tasks that require reading a .docx, .doc, .html, .odt, .pdf, .pptx, .rtf +or .txt file this is the go-to function to get the job done. 
Below I +demonstrate reading each of these eight file formats with `read_document`. + doc_doc %>% + read_document() %>% + head(3) + + ## [1] "JRMC2202 Audio Project" "Interview Transcript" "Interviewer: Yasmine Hassan" + docx_doc %>% read_document() %>% head(3) ## [1] "JRMC2202 Audio Project" "Interview Transcript" "Interviewer: Yasmine Hassan" - doc_doc %>% + html_doc %>% read_document() %>% head(3) - ## [1] "JRMC2202 Audio Project" "Interview Transcript" "Interviewer: Yasmine Hassan" + ## [1] "textreadr Creed" + ## [2] "The textreadr package aims to be a lightweight tool kit that handles 80% of an analyst’s text reading in needs." + ## [3] "The package handles .docx, .doc, .pdf, .html, .pptx, and .txt." - rtf_doc %>% + odt_doc %>% read_document() %>% head(3) - ## [1] "Researcher 2:\tOctober 7, 1892." "Teacher 4:\tStudents it’s time to learn." "[Student discussion; unintelligible]" + ## [1] "Hello World" "I am Open Document Text Format!" pdf_doc %>% read_document() %>% @@ -303,13 +320,17 @@ done. Below I demonstrate reading each of these five file formats with ## [2] "CRAIG BREADEN: My name is Craig Breaden. I’m the audiovisual archivist at Duke University," ## [3] "and I’m with Kirston Johnson, the curator of the Archive of Documentary Arts at Duke. The date" - html_doc %>% + pptx_doc %>% read_document() %>% head(3) - ## [1] "textreadr Creed" - ## [2] "The textreadr package aims to be a lightweight tool kit that handles 80% of an analyst’s text reading in needs." - ## [3] "The package handles .docx, .doc, .pdf, .html, .pptx, and .txt." + ## [1] "Hello World" "Tyler Rinker" "Slide 1" + + rtf_doc %>% + read_document() %>% + head(3) + + ## [1] "Researcher 2:\tOctober 7, 1892." "Teacher 4:\tStudents it’s time to learn." "[Student discussion; unintelligible]" txt_doc %>% read_document() %>% @@ -334,12 +355,6 @@ done. 
Below I demonstrate reading each of these five file formats with ## | ROracle | Oracle | ## | RJDBC | JDBC | - pptx_doc %>% - read_document() %>% - head(3) - - ## [1] "Hello World" "Tyler Rinker" "Slide 1" - Read Directory Contents ----------------------- @@ -403,8 +418,32 @@ Here we have read the files in, one row per file. ## 20 9_7 Working-class romantic drama from direct ## .. ... ... -Read .docx ----------- +Basic Readers +------------- + +### Read .doc + +A .doc file is a bit trickier to read in than .docx but is made easy by +the **antiword** package which wraps the +[Antiword](http://www.winfield.demon.nl) program in an OS independent +way. + + doc_doc %>% + read_doc() %>% + head() + + ## [1] "JRMC2202 Audio Project" "Interview Transcript" "Interviewer: Yasmine Hassan" "Narrator: Ahmad Abd Rabou" "Date: 16/10/2014" "Place: Narrator's office" + + doc_doc %>% + read_doc(15) %>% + head(7) + + ## [1] "Hassan: Could you please tell me your name, your title, your age," "and your place of ref," + ## [3] "umm, residence?" "Abd Rabou: My name is Ahmad Abd Rabou. I'm assistant professor of" + ## [5] "comparative politics at" "both Cairo University and The American University" + ## [7] "in Cairo. I'm 34 years old. I" + +### Read .docx A .docx file is nothing but a fancy container. It can be parsed via XML. The `read_docx` function allows the user to read in a .docx file as @@ -425,47 +464,43 @@ collapsed with `//w:p` tags) in the markup. ## [2] "Abd Rabou: My name is Ahmad Abd Rabou. I’m assistant professor of comparative politics at" ## [3] "both Cairo University and The American University in Cairo. I’m 34 years old. I" -Read .doc ---------- - -A .doc file is a bit trickier to read in than .docx but is made easy by -the **antiword** package which wraps the -[Antiword](http://www.winfield.demon.nl) program in an OS independent -way. 
+### Read .html - doc_doc %>% - read_doc() %>% - head() - - ## [1] "JRMC2202 Audio Project" "Interview Transcript" "Interviewer: Yasmine Hassan" "Narrator: Ahmad Abd Rabou" "Date: 16/10/2014" "Place: Narrator's office" +Often a researcher only wishes to grab the text from the body of .html +files. The `read_html` function does exactly this task. For finer +control over .html scraping the user may investigate the **xml2** & +**rvest** packages for parsing .html and .xml files. Here I read in HTML +with `read_html`. - doc_doc %>% - read_doc(15) %>% - head(7) + html_doc %>% + read_html() - ## [1] "Hassan: Could you please tell me your name, your title, your age," "and your place of ref," - ## [3] "umm, residence?" "Abd Rabou: My name is Ahmad Abd Rabou. I'm assistant professor of" - ## [5] "comparative politics at" "both Cairo University and The American University" - ## [7] "in Cairo. I'm 34 years old. I" + ## [1] "textreadr Creed" + ## [2] "The textreadr package aims to be a lightweight tool kit that handles 80% of an analyst’s text reading in needs." + ## [3] "The package handles .docx, .doc, .pdf, .html, .pptx, and .txt." + ## [4] "If you have another format there is likely already another popular R package that specializes in this read in task. For example, got XML, use the xml2 package, authored by Hadley Wickham, Jim Hester, & Jeroen Ooms. Need special handling for .html? Use Hadley Wickham’s rvest package. Got SQL? Oh boy there’s a bunch of great ways to read it into R." + ## [5] "R Package" + ## [6] "SQL" + ## [7] "ROBDC" + ## [8] "Microsoft SQL Server" + ## [9] "RMySQL" + ## [10] "MySQL" + ## [11] "ROracle" + ## [12] "Oracle" + ## [13] "RJDBC" + ## [14] "JDBC" -Read .rtf ---------- +### Read .odt -Rich text format (.rtf) is a plain text document with markup similar to -latex. The **striprtf** package provides the backend for `read_rtf`. +Open Document Texts (.odt) are rather similar to .docx files in how they +behave. 
The `read_odt` function reads them in in a similar way. - rtf_doc %>% - read_rtf() + odt_doc %>% + read_odt() - ## [1] "Researcher 2:\tOctober 7, 1892." - ## [2] "Teacher 4:\tStudents it’s time to learn." - ## [3] "[Student discussion; unintelligible]" - ## [4] "Multiple Students:\tYes teacher we‘re ready to learn." - ## [5] "Teacher 4:\tLet's read this terrific book together. It's called Moo Baa La La La and – what was I going to … Oh yes — The story is by Sandra Boynton." - ## [6] "“A cow says Moo. A Sheep says Baa. Three singing pigs say LA LA LA! \"No, no!\" you say, that isn't right. The pigs say oink all day and night. Rhinoceroses snort and snuff. And little dogs go ruff ruff ruff! Some other dogs go bow wow wow! And cats and kittens say Meow! Quack! Says the duck. A horse says neigh. It's quiet now. What do you say? ”" + ## [1] "Hello World" "I am Open Document Text Format!" -Read .pdf ---------- +### Read .pdf Like .docx a .pdf file is simply a container. Reading PDF’s is made easier with a number of command line tools. A few methods of PDF reading @@ -494,7 +529,7 @@ with meta data, including page numbers and element (row) ids. ## 10 1 10 1940’s? ## .. ... ... ... -### Image Based .pdf: OCR +#### Image Based .pdf: OCR Image based .pdfs require optical character recognition (OCR) in order for the images to be converted to text. The `ocr` argument of `read_pdf` @@ -540,8 +575,7 @@ default behavior of `read_pdf`. ## 10 1 10 such as clustering, classication, etc. ## .. ... ... ... -Read .pptx ----------- +### Read .pptx Like the .docx, a .pptx file is also nothing but a fancy container. Likewise, it can be parsed via XML. The `read_pptx` function allows the @@ -572,32 +606,20 @@ slide id numbers. ## 18: 4 6 Green ## 19: 4 7 Orange -Read .html ----------- +### Read .rtf -Often a researcher only wishes to grab the text from the body of .html -files. The `read_html` function does exactly this task. 
For finer -control over .html scraping the user may investigate the **xml2** & -**rvest** packages for parsing .html and .xml files. Here I read in HTML -with `read_html`. +Rich text format (.rtf) is a plain text document with markup similar to +latex. The **striprtf** package provides the backend for `read_rtf`. - html_doc %>% - read_html() + rtf_doc %>% + read_rtf() - ## [1] "textreadr Creed" - ## [2] "The textreadr package aims to be a lightweight tool kit that handles 80% of an analyst’s text reading in needs." - ## [3] "The package handles .docx, .doc, .pdf, .html, .pptx, and .txt." - ## [4] "If you have another format there is likely already another popular R package that specializes in this read in task. For example, got XML, use the xml2 package, authored by Hadley Wickham, Jim Hester, & Jeroen Ooms. Need special handling for .html? Use Hadley Wickham’s rvest package. Got SQL? Oh boy there’s a bunch of great ways to read it into R." - ## [5] "R Package" - ## [6] "SQL" - ## [7] "ROBDC" - ## [8] "Microsoft SQL Server" - ## [9] "RMySQL" - ## [10] "MySQL" - ## [11] "ROracle" - ## [12] "Oracle" - ## [13] "RJDBC" - ## [14] "JDBC" + ## [1] "Researcher 2:\tOctober 7, 1892." + ## [2] "Teacher 4:\tStudents it’s time to learn." + ## [3] "[Student discussion; unintelligible]" + ## [4] "Multiple Students:\tYes teacher we‘re ready to learn." + ## [5] "Teacher 4:\tLet's read this terrific book together. It's called Moo Baa La La La and – what was I going to … Oh yes — The story is by Sandra Boynton." + ## [6] "“A cow says Moo. A Sheep says Baa. Three singing pigs say LA LA LA! \"No, no!\" you say, that isn't right. The pigs say oink all day and night. Rhinoceroses snort and snuff. And little dogs go ruff ruff ruff! Some other dogs go bow wow wow! And cats and kittens say Meow! Quack! Says the duck. A horse says neigh. It's quiet now. What do you say? ”" Read Transcripts ---------------- @@ -615,7 +637,19 @@ handles. 
These are the files that will be read in: base_name(trans_docs) - ## [1] "trans1.docx" "trans2.docx" "trans3.docx" "trans4.xlsx" "trans5.xls" "trans6.doc" "trans7.rtf" "transcripts" + ## [1] "trans1.docx" "trans2.docx" "trans3.docx" "trans4.xlsx" "trans5.xls" "trans6.doc" "trans7.rtf" "trans8.odt" "transcripts" + +### doc + + read_transcript(trans_docs[6], skip = 1) + + ## Table: [3 x 2] + ## + ## Person Dialogue + ## 1 Teacher 4 Students it's time to learn. [Student di + ## 2 Multiple Students Yes teacher we're ready to learn. + ## 3 Teacher 4 Let's read this terrific book together. + ## . ... ... ### docx Simple @@ -678,14 +712,36 @@ separator the first go round. ## 3 Teacher 4 Let's read this terrific book together. ## . ... ... +### odt + + read_transcript(trans_docs[8]) + + ## Table: [4 x 2] + ## + ## Person Dialogue + ## 1 Researcher 2 October 7,1892. + ## 2 Teacher4 Students it's time to learn. [Student di + ## 3 Multiple Students Yes teacher we're ready to learn. + ## 4 Teacher4 Let's read this terrific book together. + ## . ... ... + +### rtf + + read_transcript(rtf_doc, skip = 1) + + ## Table: [4 x 2] + ## + ## Person Dialogue + ## 1 Researcher 2 October 7, 1892. + ## 2 Teacher 4 Students it's time to learn. [Student di + ## 3 Multiple Students Yes teacher we're ready to learn. + ## 4 Teacher 4 Let's read this terrific book together. + ## . ... ... + ### xls and xlsx read_transcript(trans_docs[4]) - ## New names: - ## * `` -> ...1 - ## * `` -> ...2 - ## Table: [7 x 2] ## ## Person Dialogue @@ -700,10 +756,6 @@ separator the first go round. read_transcript(trans_docs[5]) - ## New names: - ## * `` -> ...1 - ## * `` -> ...2 - ## Table: [7 x 2] ## ## Person Dialogue @@ -716,31 +768,6 @@ separator the first go round. ## 7 Teacher 4: Let's read this terrific book together. ## . ... ... -### doc - - read_transcript(trans_docs[6], skip = 1) - - ## Table: [3 x 2] - ## - ## Person Dialogue - ## 1 Teacher 4 Students it's time to learn. 
[Student di - ## 2 Multiple Students Yes teacher we're ready to learn. - ## 3 Teacher 4 Let's read this terrific book together. - ## . ... ... - -### rtf - - read_transcript(rtf_doc, skip = 1) - - ## Table: [4 x 2] - ## - ## Person Dialogue - ## 1 Researcher 2 October 7, 1892. - ## 2 Teacher 4 Students it's time to learn. [Student di - ## 3 Multiple Students Yes teacher we're ready to learn. - ## 4 Teacher 4 Let's read this terrific book together. - ## . ... ... - ### Reading Text Like `read.table`, `read_transcript` also has a `text` argument which is @@ -810,7 +837,16 @@ I demonstrate pairings with if (!require("pacman")) install.packages("pacman"); library(pacman) p_load(dplyr, qdapRegex) p_load_current_gh(file.path('trinker', c('textreadr', 'textshape', 'textclean'))) - + + ## + ## checking for file 'C:\Users\trinker\AppData\Local\Temp\RtmpsLX1ol\remotes2aa410a9521e\trinker-textreadr-b176aa5/DESCRIPTION' ... v checking for file 'C:\Users\trinker\AppData\Local\Temp\RtmpsLX1ol\remotes2aa410a9521e\trinker-textreadr-b176aa5/DESCRIPTION' (351ms) + ## - preparing 'textreadr': (1.2s) + ## checking DESCRIPTION meta-information ... checking DESCRIPTION meta-information ... v checking DESCRIPTION meta-information + ## - checking for LF line-endings in source and make files and shell scripts + ## - checking for empty or unneeded directories + ## - building 'textreadr_1.0.3.tar.gz' + ## + ## ## Read in pdf, split on variables dat <- 'http://scdb.wustl.edu/_brickFiles/2012_01/SCDB_2012_01_codebook.pdf' %>% diff --git a/man/read_odt.Rd b/man/read_odt.Rd new file mode 100644 index 0000000..4ee61e3 --- /dev/null +++ b/man/read_odt.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/read_odt.R +\name{read_odt} +\alias{read_odt} +\title{Read in .odt Content} +\usage{ +read_odt(file, skip = 0, remove.empty = TRUE, trim = TRUE, ...) 
+} +\arguments{ +\item{file}{The path to the .odt file.} + +\item{skip}{The number of lines to skip.} + +\item{remove.empty}{logical. If \code{TRUE} empty elements in the vector are +removed.} + +\item{trim}{logical. If \code{TRUE} the leading/training white space is +removed.} + +\item{...}{ignored.} +} +\value{ +Returns a character vector. +} +\description{ +Read in the content from a .odt file. +} +\examples{ +\dontrun{ +url <- "https://github.com/trinker/textreadr/raw/master/inst/docs/Hello_World.odt" +file <- download(url) +(txt <- read_odt(file)) +} +} +\keyword{odt} diff --git a/man/read_transcript.Rd b/man/read_transcript.Rd index d8c8ad1..fa86d38 100644 --- a/man/read_transcript.Rd +++ b/man/read_transcript.Rd @@ -104,6 +104,8 @@ each transcript for errors before further parsing the data. (doc4 <- system.file("docs/trans4.xlsx", package = "textreadr")) (doc5 <- system.file("docs/trans5.xls", package = "textreadr")) (doc6 <- system.file("docs/trans6.doc", package = "textreadr")) +##(doc7 <- system.file("docs/trans7.rtf", package = "textreadr")) +(doc8 <- system.file("docs/trans8.odt", package = "textreadr")) dat1 <- read_transcript(doc1) dat2 <- read_transcript(doc1, col.names = c("person", "dialogue")) @@ -120,8 +122,8 @@ dat6 <- read_transcript(doc5) ## MS doc format \dontrun{ -dat7 <- read_transcript(doc6) ## need to skip Researcher -dat8 <- read_transcript(doc6, skip = 1) +dat6b <- read_transcript(doc6) ## need to skip Researcher +dat6c <- read_transcript(doc6, skip = 1) } ## rtf format @@ -132,6 +134,9 @@ rtf_doc <- download( dat9 <- read_transcript(rtf_doc, skip = 1) } +## odt format +read_transcript(doc8) + ## text string input trans <- "sam: Computer is fun. Not too fun. greg: No it's not, it's dumb.