http://www.blog.pythonlibrary.org/2018/06/07/an-intro-to-pypdf2/

In [68]:
import os

from PyPDF2 import PdfFileMerger, PdfFileReader, PdfFileWriter
 
def get_info(path):
    """Return basic metadata for the PDF stored at ``path``.

    Args:
        path: Filesystem path of the PDF file to inspect.

    Returns:
        dict: Keys ``author``, ``creator``, ``subject``, ``title`` and
        ``num_pages``. The four metadata values are ``None`` when the
        document carries no information dictionary at all.
    """
    with open(path, 'rb') as f:
        pdf = PdfFileReader(f)
        info = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()

        # getDocumentInfo() yields None for a PDF without an /Info dictionary;
        # getattr with a default turns that case into None-valued fields
        # instead of an AttributeError. Read while the file is still open.
        metadata = {
            field: getattr(info, field, None)
            for field in ('author', 'creator', 'subject', 'title')
        }
        metadata['num_pages'] = number_of_pages
        return metadata
    
def text_extractor(path):
    """Extract the text of every page of the PDF at ``path``.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        list: One entry per page, in page order, each being whatever
        ``extractText()`` produced for that page.
    """
    with open(path, 'rb') as stream:
        reader = PdfFileReader(stream)
        # Extraction happens while the underlying file is still open.
        return [
            reader.getPage(page_index).extractText()
            for page_index in range(reader.getNumPages())
        ]

def pdf_splitter(path):
    """Split the PDF at ``path`` into one single-page PDF per page.

    Each page is written to ``<stem>_page_<n>.pdf`` in the current working
    directory, where ``<stem>`` is the input file name without its extension
    and ``<n>`` is the 1-based page number. A confirmation line is printed
    for every file written.

    Args:
        path: Filesystem path of the PDF file to split.
    """
    stem, _extension = os.path.splitext(os.path.basename(path))
    source = PdfFileReader(path)

    for index in range(source.getNumPages()):
        # A fresh writer per page keeps every output file single-page.
        single_page = PdfFileWriter()
        single_page.addPage(source.getPage(index))

        target = f'{stem}_page_{index + 1}.pdf'
        with open(target, 'wb') as sink:
            single_page.write(sink)

        print(f'Created: {target}')
    
def merge_pdf(out_pdf, in_pdf_list):
    """Concatenate the pages of several PDFs into ``out_pdf``.

    Nothing is printed or written unless ``in_pdf_list`` holds more than one
    path and at least one page was collected.

    Args:
        out_pdf: Path of the merged PDF to create.
        in_pdf_list: List of input PDF paths, merged in the given order.
    """
    combined = PdfFileWriter()
    merged_pages = 0

    for source_path in in_pdf_list:
        reader = PdfFileReader(source_path)
        for index in range(reader.getNumPages()):
            combined.addPage(reader.getPage(index))
            merged_pages += 1

    # Guard clause: a single input (or no pages at all) is not a merge.
    if len(in_pdf_list) <= 1 or merged_pages <= 0:
        return

    print(f"{merged_pages} pages from {','.join(in_pdf_list)} are merged into {out_pdf}")
    with open(out_pdf, 'wb') as sink:
        combined.write(sink)
              
def merger(output_path, input_paths):
    """Concatenate whole PDF files into ``output_path`` using PdfFileMerger.

    Args:
        output_path: Path of the merged PDF to create.
        input_paths: Iterable of input PDF paths, appended in the given order.
    """
    pdf_merger = PdfFileMerger()
    try:
        for path in input_paths:
            pdf_merger.append(path)

        with open(output_path, 'wb') as fileobj:
            pdf_merger.write(fileobj)
    finally:
        # append() was given paths, so the merger owns the input handles;
        # close() releases them even when appending or writing fails.
        pdf_merger.close()
              
def watermark(input_pdf, output_pdf, watermark_pdf):
    """Merge the first page of ``watermark_pdf`` into every page of ``input_pdf``.

    Args:
        input_pdf: Path of the PDF whose pages receive the watermark.
        output_pdf: Path of the watermarked PDF to create.
        watermark_pdf: Path of the PDF whose first page is used as the stamp.
    """
    # Only page 0 of the watermark document is used.
    stamp_page = PdfFileReader(watermark_pdf).getPage(0)

    source = PdfFileReader(input_pdf)
    stamped = PdfFileWriter()

    for index in range(source.getNumPages()):
        current = source.getPage(index)
        # mergePage modifies the page object it is called on.
        current.mergePage(stamp_page)
        stamped.addPage(current)

    with open(output_pdf, 'wb') as sink:
        stamped.write(sink)
              
def rotator(path, output_path='pdf_rotator.pdf'):
    """Write a copy of the first three pages of a PDF with two of them rotated.

    Page 1 is rotated 90 degrees clockwise, page 2 is rotated 90 degrees
    counter-clockwise and page 3 is copied unchanged. Pages beyond the third
    are not copied. Documents with fewer than three pages are handled by
    processing only the pages that exist.

    Args:
        path: Filesystem path of the PDF file to read.
        output_path: Path of the PDF to create. Defaults to
            ``'pdf_rotator.pdf'`` in the current working directory.
    """
    pdf_writer = PdfFileWriter()
    pdf_reader = PdfFileReader(path)

    # Clamp to the pages that exist so short documents do not raise.
    page_count = min(3, pdf_reader.getNumPages())
    for index in range(page_count):
        page = pdf_reader.getPage(index)
        if index == 0:
            page = page.rotateClockwise(90)
        elif index == 1:
            page = page.rotateCounterClockwise(90)
        pdf_writer.addPage(page)

    with open(output_path, 'wb') as fh:
        pdf_writer.write(fh)

In [58]:
!ls

data_tables_page_1.pdf	data_tables.pdf  pypdf2.ipynb
data_tables_page_2.pdf	fw9.pdf		 read_pdf.ipynb


### test 1

In [46]:
pdf_path = 'data_tables.pdf'

In [47]:
print(get_info(pdf_path))

{'author': None, 'creator': 'Adobe InDesign 2.0.2', 'subject': None, 'title': 'Sample Data for Data Tables', 'num_pages': 2}


In [6]:
# extract text
# The handle is left open because the PdfFileReader built from it in the next
# cell keeps being used by the cells after that. No visible cell closes it,
# so call f.close() once the exploration is finished.
f = open(pdf_path, 'rb')

In [7]:
pdf = PdfFileReader(f)

In [9]:
pdf.numPages, pdf.getNumPages()

(2, 2)

In [15]:
page = pdf.getPage(0)

In [28]:
page.items()

dict_items([('/Type', '/Page'), ('/MediaBox', [0, 0, 612, 792]), ('/Parent', IndirectObject(18, 0)), ('/BleedBox', [0, 0, 612, 792]), ('/TrimBox', [0, 0, 612, 792]), ('/CropBox', [0, 0, 612, 792]), ('/ArtBox', [0, 0, 612, 792]), ('/Contents', [IndirectObject(37, 0), IndirectObject(42, 0), IndirectObject(44, 0), IndirectObject(46, 0), IndirectObject(48, 0), IndirectObject(56, 0), IndirectObject(58, 0), IndirectObject(114, 0)]), ('/Resources', {'/Font': {'/C2_0': IndirectObject(27, 0), '/C2_1': IndirectObject(25, 0), '/C2_2': IndirectObject(30, 0), '/C0_0': IndirectObject(35, 0), '/C2_3': IndirectObject(41, 0), '/C2_4': IndirectObject(53, 0), '/C2_5': IndirectObject(55, 0)}, '/ExtGState': {'/GS0': IndirectObject(115, 0), '/GS1': IndirectObject(116, 0), '/GS2': IndirectObject(117, 0)}, '/ProcSet': ['/PDF', '/Text']}), ('/Rotate', 0)])

#### Some PDF documents do not work with extractText()

In [17]:
text = page.extractText()

In [20]:
type(text), text

(str,
 '\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n')

In [24]:
type(page.getContents()[0])

PyPDF2.generic.IndirectObject

In [25]:
page.getContents()[0]

IndirectObject(37, 0)

In [29]:
page.raw_get('/Contents')

[IndirectObject(37, 0),
 IndirectObject(42, 0),
 IndirectObject(44, 0),
 IndirectObject(46, 0),
 IndirectObject(48, 0),
 IndirectObject(56, 0),
 IndirectObject(58, 0),
 IndirectObject(114, 0)]

In [55]:
# Same file as used at the start of test 1.
pdf_path = 'data_tables.pdf'
# Run the extractor over every page; for this file each page comes back as
# newline characters only (see the output below).
print(text_extractor(pdf_path))

['\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n', '\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n']


In [56]:
pdf_splitter(pdf_path)

Created: data_tables_page_1.pdf
Created: data_tables_page_2.pdf


### test 2

In [48]:
pdf_path = 'fw9.pdf'

In [49]:
print(get_info(pdf_path))

{'author': 'SE:W:CAR:MP', 'creator': 'Adobe LiveCycle Designer ES 9.0', 'subject': 'Request for Taxpayer Identification Number and Certification', 'title': 'Form W-9 (Rev. October 2018)', 'num_pages': 6}


In [50]:
page_texts = text_extractor(pdf_path)

In [51]:
# text on last page
page_texts[-1]

'Form W-9 (Rev. 10-2018)Page 6 The IRS does not initiate contacts with taxpayers via emails. Also, the IRS does not request personal detailed information through email or ask taxpayers for the PIN numbers, passwords, or similar secret access \ninformation for their credit card, bank, or other financial accounts.If you receive an unsolicited email claiming to be from the IRS, forward this message to phishing@irs.gov. You may also report misuse \nof the IRS name, logo, or other IRS property to the Treasury Inspector \nGeneral for Tax Administration (TIGTA) at 1-800-366-4484. You can \nforward suspicious emails to the Federal Trade Commission at \nspam@uce.gov or report them at www.ftc.gov/complaint. You can \ncontact the FTC at www.ftc.gov/idtheft or 877-IDTHEFT (877-438-4338). \nIf you have been the victim of identity theft, see www.IdentityTheft.gov \nand Pub. 5027.Visit www.irs.gov/IdentityTheft to learn more about identity theft and how to reduce your risk.Privacy Act NoticeSection 6

### test 3 - merge pdfs

In [65]:
# Merge the W-9 form with the first page split out of data_tables.pdf.
# merge_pdf only writes when it is given more than one input path.
merge_pdf("test_merge.pdf", ['fw9.pdf', 'data_tables_page_1.pdf'])

7 pages from fw9.pdf,data_tables_page_1.pdf are merged into test_merge.pdf
