In [1]:
import io
 
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage

In [2]:
def extract_text_from_pdf(pdf_path):
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
 
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, 
                                      caching=True,
                                      check_extractable=True):
            page_interpreter.process_page(page)
 
        text = fake_file_handle.getvalue()
 
    # close open handles
    converter.close()
    fake_file_handle.close()
 
    if text:
        return text

In [3]:
extract_text_from_pdf('../example/Zeng_et_al-2008-Angewandte_Chemie_(International_ed._in_English).pdf')

'MicrofluidicsDOI:10.1002/anie.200800816MicrofluidicSelf-PatterningofLarge-ScaleCrystallineNanoarraysforHigh-ThroughputContinuousDNAFractionation**YongZeng,MeiHe,andD.JedHarrison*Self-assemblyoffersanattractiveroutetoproducemacro-scopicfunctionalensemblesfrommicroscopicbuildingblocks,suchasmacromolecules,colloids,andnanotubes.[1,2]Spontaneousorganizationofmonodispersedcolloidsintocrystallinearrays,dubbedcolloidalselfassembly(CSA),hasbeenextensivelyexploredinavarietyofareas,suchasthedesignofmaterials,[3]photonics,[4]sensors,[5]andmolecularmanipulationandseparation.[6–10]Highqualitycolloidalfilmswithminimallatticedefectsandlargedomainsizehavebeenachieved.[11,12]Successinmanyapplicationswillalsodemandtheabilitytopatternself-assembledcolloidallatticesintowell-definedarchitecturesandwithinintegratedmicrosys-tems.Anumberofpatterningmethodshavebeendeveloped,includingphysicalconfinement,[13–15]electricfield-assistedpatterning,[16]andsurface-directedpatterning.[17]Large-scalecolloidalpatterning

In [4]:
def extract_text_by_page(pdf_path):
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, 
                                      caching=True,
                                      check_extractable=True):
            resource_manager = PDFResourceManager()
            fake_file_handle = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handle)
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            page_interpreter.process_page(page)
 
            text = fake_file_handle.getvalue()
            yield text
 
            # close open handles
            converter.close()
            fake_file_handle.close()

In [5]:
def extract_text(pdf_path):
    for page in extract_text_by_page(pdf_path):
        print(page)
        print()

In [6]:
extract_text('../example/Zeng_et_al-2008-Angewandte_Chemie_(International_ed._in_English).pdf')

MicrofluidicsDOI:10.1002/anie.200800816MicrofluidicSelf-PatterningofLarge-ScaleCrystallineNanoarraysforHigh-ThroughputContinuousDNAFractionation**YongZeng,MeiHe,andD.JedHarrison*Self-assemblyoffersanattractiveroutetoproducemacro-scopicfunctionalensemblesfrommicroscopicbuildingblocks,suchasmacromolecules,colloids,andnanotubes.[1,2]Spontaneousorganizationofmonodispersedcolloidsintocrystallinearrays,dubbedcolloidalselfassembly(CSA),hasbeenextensivelyexploredinavarietyofareas,suchasthedesignofmaterials,[3]photonics,[4]sensors,[5]andmolecularmanipulationandseparation.[6–10]Highqualitycolloidalfilmswithminimallatticedefectsandlargedomainsizehavebeenachieved.[11,12]Successinmanyapplicationswillalsodemandtheabilitytopatternself-assembledcolloidallatticesintowell-definedarchitecturesandwithinintegratedmicrosys-tems.Anumberofpatterningmethodshavebeendeveloped,includingphysicalconfinement,[13–15]electricfield-assistedpatterning,[16]andsurface-directedpatterning.[17]Large-scalecolloidalpatterning,

In [7]:
import os
import xml.etree.ElementTree as xml
from xml.dom import minidom

In [8]:
def export_as_xml(pdf_path, xml_path):
    filename = os.path.splitext(os.path.basename(pdf_path))[0]
    root = xml.Element('{filename}'.format(filename=filename))
    pages = xml.Element('Pages')
    root.append(pages)
 
    counter = 1
    for page in extract_text_by_page(pdf_path):
        text = xml.SubElement(pages, 'Page_{}'.format(counter))
        text.text = page[0:100]
        counter += 1
 
    tree = xml.ElementTree(root)
    xml_string = xml.tostring(root, 'utf-8')
    parsed_string = minidom.parseString(xml_string)
    pretty_string = parsed_string.toprettyxml(indent='  ')
 
    with open(xml_path, 'w') as fh:
        fh.write(pretty_string)

In [9]:
export_as_xml('../example/Zeng_et_al-2008-Angewandte_Chemie_(International_ed._in_English).pdf', '../example/output.xml')

ExpatError: not well-formed (invalid token): line 1, column 35

In [10]:
# csv_exporter.py
 
import csv
import os
 
def export_as_csv(pdf_path, csv_path):
    filename = os.path.splitext(os.path.basename(pdf_path))[0]
 
    counter = 1
    with open(csv_path, 'w') as csv_file:
        writer = csv.writer(csv_file)
        for page in extract_text_by_page(pdf_path):
            text = page[0:100]
            words = text.split()
            writer.writerow(words)

In [11]:
export_as_csv('../example/Zeng_et_al-2008-Angewandte_Chemie_(International_ed._in_English).pdf', '../example/output.csv')

In [14]:
# image_exporter.py
 
import os
import subprocess
 
def image_exporter(pdf_path, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
 
    cmd = ['pdfimages', pdf_path, 
           '{}/prefix'.format(output_dir)]
    subprocess.call(cmd)
    print('Images extracted:')
    print(os.listdir(output_dir))

In [15]:
image_exporter('../example/Zeng_et_al-2008-Angewandte_Chemie_(International_ed._in_English).pdf', '../example/figure-')

Images extracted:
['prefix-0000.pbm', 'prefix-0002.ppm', 'prefix-0004.pbm', 'prefix-0001.pgm', 'prefix-0003.pgm', 'prefix-0005.pgm']
