Skip to content

Commit

Permalink
Initial update to pdfbox2.0
Browse files: browse the repository at this point in the history
  • Loading branch information
tpaskhalis committed Mar 30, 2018
1 parent 4e3eae1 commit d848d40
Show file tree
Hide file tree
Showing 6 changed files with 27 additions and 19 deletions.
2 changes: 1 addition & 1 deletion R/extract_tables.R
Expand Up @@ -70,7 +70,7 @@ function(file,
pages <- as.integer(pages)
pageIterator <- oe$extract(make_pages(pages))
}
npages <- pdfDocument$getDocumentCatalog()$getAllPages()$size()
npages <- pdfDocument$getNumberOfPages()
area <- make_area(area = area, pages = pages, npages = npages)
columns <- make_columns(columns = columns, pages = pages, npages = npages)

Expand Down
2 changes: 1 addition & 1 deletion R/extract_text.R
Expand Up @@ -25,7 +25,7 @@ extract_text <- function(file, pages = NULL, password = NULL, encoding = NULL) {
pdfDocument <- load_doc(file, password = password)
on.exit(pdfDocument$close())

stripper <- new(J("org.apache.pdfbox.util.PDFTextStripper"))
stripper <- new(J("org.apache.pdfbox.text.PDFTextStripper"))

if (!is.null(pages)) {
pages <- as.integer(pages)
Expand Down
4 changes: 2 additions & 2 deletions R/get_page_dims.R
Expand Up @@ -33,7 +33,7 @@ get_page_dims <- function(file, doc, pages = NULL, password = NULL) {
pages <- 1L:(get_n_pages(doc = doc))
}

allpages <- doc$getDocumentCatalog()$getAllPages()
allpages <- doc$getDocumentCatalog()$getPages()
lapply(pages, function(x) {
thispage <- allpages$get(x-1L)
c(thispage$getMediaBox()$getWidth(), thispage$getMediaBox()$getHeight())
Expand All @@ -47,5 +47,5 @@ get_n_pages <- function(file, doc, password = NULL) {
doc <- load_doc(file, password = password, copy = FALSE)
on.exit(doc$close())
}
doc$getDocumentCatalog()$getAllPages()$size()
doc$getNumberOfPages()
}
27 changes: 18 additions & 9 deletions R/make_thumbnails.R
Expand Up @@ -22,8 +22,12 @@
#' @importFrom rJava J new
#' @seealso \code{\link{extract_tables}}, \code{\link{extract_text}}, \code{\link{make_thumbnails}}
#' @export
make_thumbnails <-
function(file, outdir = NULL, pages = NULL, format = c("png", "jpeg", "bmp", "gif"), resolution = 72L, password = NULL) {
make_thumbnails <- function(file,
outdir = NULL,
pages = NULL,
format = c("png", "jpeg", "bmp", "gif"),
resolution = 72L,
password = NULL) {
file <- localize_file(file)
pdfDocument <- load_doc(file, password = password)
on.exit(pdfDocument$close())
Expand All @@ -37,17 +41,22 @@ function(file, outdir = NULL, pages = NULL, format = c("png", "jpeg", "bmp", "gi
format <- match.arg(format)
fileseq <- formatC(pages, width = max(nchar(pages)), flag = 0)
if (is.null(outdir)) {
prefix <- basename(file_path_sans_ext(file))
outfile <- paste0(file_path_sans_ext(file), fileseq, ".", format)
} else {
prefix <- file.path(outdir, basename(file_path_sans_ext(file)))
outfile <- file.path(outdir, paste0(basename(file_path_sans_ext(file)), fileseq, ".", format))
filename <- paste0(basename(file_path_sans_ext(file)), fileseq, ".", format)
outfile <- file.path(outdir, filename)
}
for (i in seq_along(pages)) {
PDFImageWriter <- new(J("org.apache.pdfbox.util.PDFImageWriter"))
PDFImageWriter$writeImage(pdfDocument, format, "", pages[i], pages[i],
prefix, 1L, as.integer(resolution))
pageIndex <- pages[i] - 1L
PDFRenderer <- new(J("org.apache.pdfbox.rendering.PDFRenderer"),
document = pdfDocument)
# BufferedImage <- PDFRenderer$renderImageWithDPI(pages[i],
# scale = as.double(resolution))
BufferedImage <- PDFRenderer$renderImage(pageIndex)
JavaFile <- new(J("java.io.File"), pathname = outfile[i])
J("javax.imageio.ImageIO")$write(BufferedImage,
format,
JavaFile)
}
file.rename(from = paste0(prefix, pages, ".", format), to = outfile)
ifelse(file.exists(outfile), outfile, NA_character_)
}
4 changes: 2 additions & 2 deletions R/split_merge.R
Expand Up @@ -30,7 +30,7 @@ split_pdf <- function(file, outdir = NULL, password = NULL) {
file <- localize_file(file, copy = TRUE)
pdfDocument <- load_doc(file, password = password)
on.exit(pdfDocument$close())
splitter <- new(J("org.apache.pdfbox.util.Splitter"))
splitter <- new(J("org.apache.pdfbox.multipdf.Splitter"))
splitArray <- splitter$split(pdfDocument)
iterator <- splitArray$iterator()
p <- 1L
Expand All @@ -57,7 +57,7 @@ split_pdf <- function(file, outdir = NULL, password = NULL) {
merge_pdfs <- function(file, outfile) {
outfile <- path.expand(outfile)
file <- unlist(lapply(file, localize_file, copy = TRUE))
merger <- new(J("org.apache.pdfbox.util.PDFMergerUtility"))
merger <- new(J("org.apache.pdfbox.multipdf.PDFMergerUtility"))
merger$setDestinationFileName(outfile)
lapply(file, merger$addSource)
merger$mergeDocuments()
Expand Down
7 changes: 3 additions & 4 deletions R/utils.R
Expand Up @@ -19,12 +19,11 @@ localize_file <- function(path, copy = FALSE, quiet = TRUE) {
load_doc <- function(file, password = NULL, copy = TRUE) {
file <- localize_file(path = file, copy = copy)
pdfDocument <- new(J("org.apache.pdfbox.pdmodel.PDDocument"))
scratchFileJava <- new(J("java.io.File"), scratchFile <- tempfile())
randomAccess <- new(J("org.apache.pdfbox.io.RandomAccessFile"), scratchFileJava, "rw")
fileJava <- new(J("java.io.File"), pathname = file)
if (is.null(password)) {
doc <- pdfDocument$load(filename = file, scratchFile = randomAccess)
doc <- pdfDocument$load(file = fileJava)
} else {
doc <- pdfDocument$load(filename = file, scratchFile = randomAccess, password = password)
doc <- pdfDocument$load(file = fileJava, password = password)
}
pdfDocument$close()
doc
Expand Down

0 comments on commit d848d40

Please sign in to comment.