* [How to Work With a PDF in Python](https://realpython.com/pdf-python/)
* [Python for Pdf](https://towardsdatascience.com/python-for-pdf-ef0fac2808b0)
* [PyPDF2](http://www.blog.pythonlibrary.org/2018/06/07/an-intro-to-pypdf2/)

* [pdfrw](https://github.com/pmaupin/pdfrw)

* [pdfminer](https://github.com/euske/pdfminer)
    * CLI utility
    
* [tabula-py](https://github.com/chezou/tabula-py)
    * [tabula-py: Extract table from PDF into Python DataFrame](https://blog.chezo.uno/tabula-py-extract-table-from-pdf-into-python-dataframe-6c7acfa5f302)

    * [Parse Data from PDFs with Tabula and Pandas](https://hackersandslackers.com/data-from-pdfs-tabula-pandas/)
    
* [Python Web Scraping PDF Tables](https://medium.com/@DRicky.Ch29/web-scraping-pdf-tables-data-cleaning-part-1-cb6d8d47a6de)

In [88]:
import os

import tabula
from PyPDF2 import PdfFileMerger, PdfFileReader, PdfFileWriter
 
def get_info(path):
    """Return basic metadata for the PDF file at ``path``.

    Args:
        path: Filesystem path of the PDF to inspect.

    Returns:
        dict: keys ``author``, ``creator``, ``subject``, ``title`` and
        ``num_pages``. The four text fields are ``None`` when the document
        has no information object at all.
    """
    with open(path, 'rb') as f:
        pdf = PdfFileReader(f)
        info = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()

        # getDocumentInfo() returns None for a PDF without document
        # information; getattr() with a default turns that into None fields
        # instead of an AttributeError.
        return dict(
            author=getattr(info, 'author', None),
            creator=getattr(info, 'creator', None),
            subject=getattr(info, 'subject', None),
            title=getattr(info, 'title', None),
            num_pages=number_of_pages,
        )
    
def extract_text(path):
    """Extract the text of every page of the PDF at ``path``.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        list: one string per page, in page order, as produced by each
        page's ``extractText()``.
    """
    with open(path, 'rb') as stream:
        reader = PdfFileReader(stream)
        page_count = reader.getNumPages()
        # Pages must be read while the file is still open.
        return [reader.getPage(index).extractText() for index in range(page_count)]

def extract_table(path):
    """Extract all tables from the PDF at ``path`` with tabula.

    Args:
        path: Filesystem path of the PDF to parse.

    Returns:
        Whatever ``tabula.read_pdf`` returns for ``pages="all"`` and
        ``multiple_tables=True`` (the notebook indexes it like a list of
        tables).
    """
    # Use the argument, not the notebook-level ``pdf_path`` variable, so the
    # result does not depend on which cells happened to run before.
    return tabula.read_pdf(path, pages="all", multiple_tables=True)
    
def split_pdf(path):
    """Write each page of the PDF at ``path`` to its own single-page PDF.

    Output files are named ``<stem>_page_<n>.pdf``, where ``<stem>`` is the
    input file name without directory and extension and ``<n>`` starts at 1.
    The names are relative, so the files land in the current working
    directory. One ``Created: ...`` line is printed per file.

    Args:
        path: Filesystem path of the PDF to split.
    """
    stem, _extension = os.path.splitext(os.path.basename(path))
    reader = PdfFileReader(path)

    for number in range(1, reader.getNumPages() + 1):
        single_page = PdfFileWriter()
        # Page numbers in file names are 1-based, reader indices 0-based.
        single_page.addPage(reader.getPage(number - 1))

        target = '{}_page_{}.pdf'.format(stem, number)
        with open(target, 'wb') as out_file:
            single_page.write(out_file)

        print('Created: {}'.format(target))
    
def merge_pdf(out_pdf, in_pdf_list):
    """Concatenate the pages of several PDFs into ``out_pdf``.

    Pages are collected in the order of ``in_pdf_list``. The output file is
    written, and a one-line summary printed, only when more than one input
    path was given and at least one page was collected; otherwise nothing is
    written.

    Args:
        out_pdf: Path of the merged PDF to create.
        in_pdf_list: List of input PDF paths.
    """
    pdf_writer = PdfFileWriter()
    merged_pages = 0

    for source in in_pdf_list:
        reader = PdfFileReader(source)
        for index in range(reader.getNumPages()):
            merged_pages += 1
            pdf_writer.addPage(reader.getPage(index))

    # Nothing to merge: a single (or no) input, or inputs without pages.
    if len(in_pdf_list) <= 1 or merged_pages < 1:
        return

    print(f"{merged_pages} pages from {','.join(in_pdf_list)} are merged into {out_pdf}")
    with open(out_pdf, 'wb') as out_file:
        pdf_writer.write(out_file)
              
def merger(output_path, input_paths):
    """Merge the PDFs in ``input_paths`` into ``output_path`` via ``PdfFileMerger``.

    Args:
        output_path: Path of the merged PDF to create.
        input_paths: Iterable of input PDF paths, appended in iteration order.
    """
    pdf_merger = PdfFileMerger()
    try:
        for path in input_paths:
            pdf_merger.append(path)

        with open(output_path, 'wb') as fileobj:
            pdf_merger.write(fileobj)
    finally:
        # The merger keeps the inputs it was given; close() releases them
        # even when appending or writing fails.
        pdf_merger.close()
              
def watermark(input_pdf, output_pdf, watermark_pdf):
    """Merge the first page of ``watermark_pdf`` into every page of ``input_pdf``.

    Args:
        input_pdf: Path of the PDF to watermark.
        output_pdf: Path of the watermarked PDF to create.
        watermark_pdf: Path of a PDF whose first page is used as the watermark.
    """
    stamp_page = PdfFileReader(watermark_pdf).getPage(0)

    source = PdfFileReader(input_pdf)
    result = PdfFileWriter()

    for index in range(source.getNumPages()):
        current = source.getPage(index)
        # mergePage() modifies the page object in place.
        current.mergePage(stamp_page)
        result.addPage(current)

    with open(output_pdf, 'wb') as out_file:
        result.write(out_file)
              
def rotator(path, output_path='pdf_rotator.pdf'):
    """Write a copy of the first pages of ``path`` with demo rotations applied.

    Page 1 is rotated 90 degrees clockwise, page 2 is rotated 90 degrees
    counter-clockwise and page 3 is copied unchanged. Pages after the third
    are not copied. A PDF with fewer than three pages is handled by
    processing only the pages it has instead of raising ``IndexError``.

    Args:
        path: Filesystem path of the PDF to read.
        output_path: Path of the PDF to create. Defaults to
            ``'pdf_rotator.pdf'`` in the current working directory.
    """
    pdf_writer = PdfFileWriter()
    pdf_reader = PdfFileReader(path)

    # One transform per output page, in page order.
    transforms = (
        lambda page: page.rotateClockwise(90),
        lambda page: page.rotateCounterClockwise(90),
        lambda page: page,
    )
    # zip() stops at the shorter sequence, so short documents are safe.
    for index, transform in zip(range(pdf_reader.getNumPages()), transforms):
        pdf_writer.addPage(transform(pdf_reader.getPage(index)))

    with open(output_path, 'wb') as fh:
        pdf_writer.write(fh)
              
def encrypt_pdf(input_pdf, output_pdf, password):
    """Write a password-protected copy of ``input_pdf`` to ``output_pdf``.

    All pages are copied, then the writer is encrypted with ``password`` as
    the user password, no explicit owner password, and ``use_128bit=True``.

    Args:
        input_pdf: Path of the PDF to protect.
        output_pdf: Path of the encrypted PDF to create.
        password: User password passed to the writer's ``encrypt()``.
    """
    protected = PdfFileWriter()
    source = PdfFileReader(input_pdf)

    page_count = source.getNumPages()
    for index in range(page_count):
        protected.addPage(source.getPage(index))

    encryption_options = dict(user_pwd=password, owner_pwd=None, use_128bit=True)
    protected.encrypt(**encryption_options)

    with open(output_pdf, 'wb') as out_file:
        protected.write(out_file)

In [89]:
# list pdf files in current folder
!ls *.pdf

data_tables_page_1.pdf	data_tables.pdf  fw9_protected.pdf
data_tables_page_2.pdf	fw9.pdf		 test_merge.pdf


## get started

In [90]:
pdf_path = 'data_tables.pdf'

In [91]:
print(get_info(pdf_path))

{'author': None, 'creator': 'Adobe InDesign 2.0.2', 'subject': None, 'title': 'Sample Data for Data Tables', 'num_pages': 2}




In [92]:
# extract text
# The handle is left open here: the next cell passes it to PdfFileReader and
# the following cells keep using the resulting reader.
# NOTE(review): `f` is not closed in any of the visible cells; call
# `f.close()` once the exploration below is finished.
f = open(pdf_path, 'rb')

In [93]:
pdf = PdfFileReader(f)

In [94]:
pdf.numPages, pdf.getNumPages()

(2, 2)

In [95]:
page = pdf.getPage(0)

In [96]:
page.items()

dict_items([('/Type', '/Page'), ('/MediaBox', [0, 0, 612, 792]), ('/Parent', IndirectObject(18, 0)), ('/BleedBox', [0, 0, 612, 792]), ('/TrimBox', [0, 0, 612, 792]), ('/CropBox', [0, 0, 612, 792]), ('/ArtBox', [0, 0, 612, 792]), ('/Contents', [IndirectObject(37, 0), IndirectObject(42, 0), IndirectObject(44, 0), IndirectObject(46, 0), IndirectObject(48, 0), IndirectObject(56, 0), IndirectObject(58, 0), IndirectObject(114, 0)]), ('/Resources', {'/Font': {'/C2_0': IndirectObject(27, 0), '/C2_1': IndirectObject(25, 0), '/C2_2': IndirectObject(30, 0), '/C0_0': IndirectObject(35, 0), '/C2_3': IndirectObject(41, 0), '/C2_4': IndirectObject(53, 0), '/C2_5': IndirectObject(55, 0)}, '/ExtGState': {'/GS0': IndirectObject(115, 0), '/GS1': IndirectObject(116, 0), '/GS2': IndirectObject(117, 0)}, '/ProcSet': ['/PDF', '/Text']}), ('/Rotate', 0)])

In [97]:
text = page.extractText()

In [98]:
type(text), text

(str,
 '\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n')

In [99]:
type(page.getContents()[0])

PyPDF2.generic.IndirectObject

In [100]:
page.getContents()[0]

IndirectObject(37, 0)

In [101]:
page.raw_get('/Contents')

[IndirectObject(37, 0),
 IndirectObject(42, 0),
 IndirectObject(44, 0),
 IndirectObject(46, 0),
 IndirectObject(48, 0),
 IndirectObject(56, 0),
 IndirectObject(58, 0),
 IndirectObject(114, 0)]

In [102]:
pdf_path = 'data_tables.pdf'
print(extract_text(pdf_path))

['\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n', '\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n']


## extract text

`extract_text()` fails to extract meaningful text from the `data_tables.pdf` file. This could be due to the particular PDF version used when that file was created and to limitations of `PyPDF2`.

It does work with another file, `fw9.pdf`:

In [103]:
pdf_path = 'fw9.pdf'
print(get_info(pdf_path))

{'author': 'SE:W:CAR:MP', 'creator': 'Adobe LiveCycle Designer ES 9.0', 'subject': 'Request for Taxpayer Identification Number and Certification', 'title': 'Form W-9 (Rev. October 2018)', 'num_pages': 6}


In [104]:
# get last page of W9 form
page_texts = extract_text(pdf_path)

page_texts[-1]

'Form W-9 (Rev. 10-2018)Page 6 The IRS does not initiate contacts with taxpayers via emails. Also, the IRS does not request personal detailed information through email or ask taxpayers for the PIN numbers, passwords, or similar secret access \ninformation for their credit card, bank, or other financial accounts.If you receive an unsolicited email claiming to be from the IRS, forward this message to phishing@irs.gov. You may also report misuse \nof the IRS name, logo, or other IRS property to the Treasury Inspector \nGeneral for Tax Administration (TIGTA) at 1-800-366-4484. You can \nforward suspicious emails to the Federal Trade Commission at \nspam@uce.gov or report them at www.ftc.gov/complaint. You can \ncontact the FTC at www.ftc.gov/idtheft or 877-IDTHEFT (877-438-4338). \nIf you have been the victim of identity theft, see www.IdentityTheft.gov \nand Pub. 5027.Visit www.irs.gov/IdentityTheft to learn more about identity theft and how to reduce your risk.Privacy Act NoticeSection 6

### pdfminer (https://github.com/euske/pdfminer)

Consider using pdfminer for text extraction, because it ships with a good `CLI` utility:
```
$ pip install pdfminer
$ pdf2txt.py data_tables.pdf
```

## encrypt

In [105]:
encrypt_pdf(input_pdf='fw9.pdf', output_pdf='fw9_protected.pdf', password='secret')

## split into pages

In [106]:
split_pdf(pdf_path)

Created: fw9_page_1.pdf
Created: fw9_page_2.pdf
Created: fw9_page_3.pdf
Created: fw9_page_4.pdf
Created: fw9_page_5.pdf
Created: fw9_page_6.pdf


## merge

In [107]:
merge_pdf("test_merge.pdf", ['fw9.pdf', 'data_tables_page_1.pdf'])

7 pages from fw9.pdf,data_tables_page_1.pdf are merged into test_merge.pdf


## extract table to Pandas dataframe

In [108]:
pdf_path = 'data_tables.pdf'
df_tables = extract_table(pdf_path)

In [109]:
len(df_tables)

3

In [110]:
df_tables[0]

Unnamed: 0,0,1
0,Number of Coils,Number of Paperclips
1,5,"3, 5, 4"
2,10,"7, 8, 6"
3,15,"11, 10, 12"
4,20,"15, 13, 14"


In [111]:
df_tables[1]

Unnamed: 0,0,1,2,3,4
0,Speed (mph),Driver,Car,Engine,Date
1,407.447,Craig Breedlove,Spirit of America,GE J47,8/5/63
2,413.199,Tom Green,Wingfoot Express,WE J46,10/2/64
3,434.22,Art Arfons,Green Monster,GE J79,10/5/64
4,468.719,Craig Breedlove,Spirit of America,GE J79,10/13/64
5,526.277,Craig Breedlove,Spirit of America,GE J79,10/15/65
6,536.712,Art Arfons,Green Monster,GE J79,10/27/65
7,555.127,Craig Breedlove,"Spirit of America, Sonic 1",GE J79,11/2/65
8,576.553,Art Arfons,Green Monster,GE J79,11/7/65
9,600.601,Craig Breedlove,"Spirit of America, Sonic 1",GE J79,11/15/65


In [112]:
df_tables[2]

Unnamed: 0,0,1
0,Time (drops of water),Distance (cm)
1,1,10119
2,2,"29, 31, 30"
3,3,"59, 58, 61"
4,4,"102, 100, 98"
5,5,"122, 125, 127"
