/
tabulizer_summer_school_ub_2016.R
55 lines (43 loc) · 2.09 KB
/
tabulizer_summer_school_ub_2016.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# Xavier de Pedro. Copyleft 2016 (cc-by-sa). xavier.depedro@vhir.org - xavier.depedro@seeds4c.org
# From
# http://www.r-bloggers.com/when-documents-become-databases-tabulizer-r-wrapper-for-tabula-pdf-table-extractor/
# https://github.com/leeper/tabulizer/blob/master/vignettes/tabulizer.Rmd
# https://github.com/leeper/tabulizer
# Install the GitHub installer helper (ghit) only when it is missing.
# requireNamespace() tests availability without attaching the package; the
# call below is namespace-qualified (ghit::), so attaching is not needed.
if (!requireNamespace("ghit", quietly = TRUE)) {
  install.packages("ghit")
}
# Install tabulizer (and its companion jars package) from GitHub only when
# tabulizer cannot be loaded yet, so re-running this script does not reinstall
# both packages every time. To force an update, run the install_github() call
# below by hand.
if (!requireNamespace("tabulizer", quietly = TRUE)) {
  # on 64-bit Windows
  # ghit::install_github(c("leeper/tabulizerjars", "leeper/tabulizer"), INSTALL_opts = "--no-multiarch")
  # elsewhere
  ghit::install_github(c("leeper/tabulizerjars", "leeper/tabulizer"))
}
# Useful commands
# Useful commands
library("tabulizer")
# Set this flag to TRUE if you'd like to play with some demo instructions and
# documents. Spelled out as FALSE because `F` is an ordinary variable that can
# be reassigned, whereas FALSE is a reserved word.
test_demo <- FALSE
if (test_demo) {
  # Results are print()ed explicitly: inside a braced block R auto-prints only
  # the value of the last expression, and nothing at all when the script is
  # source()d, so the intermediate demo results would otherwise be invisible.

  # Example PDF located inside the installed tabulizer package
  f <- system.file("examples", "data.pdf", package = "tabulizer")
  # extract table from first page of example PDF
  tab <- extract_tables(f, pages = 1)
  print(head(tab[[1]]))
  # same extraction for page 2, requesting data.frame output
  print(extract_tables(f, pages = 2, method = "data.frame"))
  # restrict extraction to one explicit area with table guessing turned off
  # NOTE(review): area coordinates presumably follow tabulizer's
  # c(top, left, bottom, right) convention -- confirm against the package docs
  str(extract_tables(f, pages = 2, area = list(c(126, 284, 174, 417)), guess = FALSE, method = "data.frame"))
  # extract from areas of page 1 (see tabulizer docs for extract_areas())
  print(extract_areas(f, 1))
  # CSV output method, applied to the whole document (no pages argument)
  extract_tables(f, method = "csv")
  # the same example PDF, this time referenced by URL instead of a local path
  f2 <- "https://github.com/leeper/tabulizer/raw/master/inst/examples/data.pdf"
  print(extract_tables(f2, pages = 2))
}
# Let's fire the task on that data set from
# http://www.aspb.cat/quefem/docs/InformeSalut2014_2010.pdf
# for pages: 74-77 (which are page numbers in the real pdf document: 75-78)
# Local file name under which the report PDF is stored
pdffile <- "InformeSalut2014_2010.pdf"
# Be aware of where you will download the file (adapt as needed with setwd("path") )
# print() so the directory is also shown when the script is source()d
print(getwd())
# mode = "wb" forces a binary transfer: a PDF is a binary file, and a text-mode
# download on Windows rewrites line endings and corrupts it.
download_status <- download.file(
  url = "http://www.aspb.cat/quefem/docs/InformeSalut2014_2010.pdf",
  destfile = pdffile,
  mode = "wb"
)
# download.file() returns 0 on success; stop here rather than handing a
# missing or partial file to the extractor.
if (download_status != 0) {
  stop("Download of ", pdffile, " failed (status ", download_status, ")",
       call. = FALSE)
}
p <- pdffile
# extract_tables(p, pages = 75:78, method = "csv")
# Extract the tables on PDF pages 75 to 78 using the CSV output method
extract_areas(p, pages = 75:78, method = "csv")
# Open those CSV files in LibreOffice or similar, and merge them by hand, since this is a one-time process
# and not a repetitive task that would need to be automated in some way, etc.
# (easier and faster by hand for this use case)
# And fill in the district blanks so that it behaves like a proper database
# And save as ODS, or Excel, etc.
# Done!