In [5]:
import time
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage

**For more customization and flexibility.**

* `PDFResourceManager()` → manages shared resources (fonts, images, etc.).
* `TextConverter()` → converts PDF layout objects into text.
* `PDFPageInterpreter()` → processes page by page.

Then loop over all pages with `PDFPage.get_pages(...)`.

In [2]:
def pdf2txt(path):
    """Extract the text of a PDF file using pdfminer's low-level API.

    Each page yielded by ``PDFPage.get_pages`` is fed through a
    ``PDFPageInterpreter`` whose output device is a ``TextConverter``
    writing into an in-memory ``StringIO`` buffer.

    Args:
        path: Path of the PDF file to read; it is opened in binary mode.

    Returns:
        str: Everything the converter wrote to the buffer for the
        processed pages.

    Raises:
        OSError: If ``path`` cannot be opened.
        Any exception raised by pdfminer while parsing propagates to the
        caller; the converter, the buffer and the file are still released.
    """
    rsrcmgr = PDFResourceManager()
    # Both the buffer and the file are context-managed so they are closed
    # even when a page fails to parse.
    with StringIO() as retstr, open(path, 'rb') as fp:
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # maxpages=0 places no limit on the number of pages read.
            for page in PDFPage.get_pages(fp, maxpages=0, caching=True, check_extractable=True):
                interpreter.process_page(page)
            # Read the buffer before it is closed by the enclosing `with`.
            return retstr.getvalue()
        finally:
            # Previously skipped on error, leaking the converter.
            device.close()

In [7]:
# Time the low-level extraction. perf_counter() is a monotonic,
# high-resolution clock meant for measuring intervals; time.time() is the
# wall clock and can jump if the system time is adjusted mid-run.
start_time = time.perf_counter()
extracted_text = pdf2txt('pdfdecoment.pdf')
end_time = time.perf_counter()
print(f"Extraction completed in {round(end_time - start_time, 2)} seconds.")

Extraction completed in 0.26 seconds.


**For simplicity, with less flexibility and customization.**

`extract_text()` automatically opens the file, creates the `PDFResourceManager`, `TextConverter`, and `PDFPageInterpreter`, and loops through all pages.

In [None]:
# Time the high-level one-call extraction. `extract_text` is imported in
# the notebook's import cell. perf_counter() is used instead of time.time()
# because it is monotonic and intended for measuring elapsed intervals.
start_time = time.perf_counter()
text = extract_text('pdfdecoment.pdf')
end_time = time.perf_counter()
print(f"Extraction completed in {round(end_time - start_time, 2)} seconds.")


Extraction completed in 0.24 seconds.


In [9]:
# Save the text produced by extract_text() to disk as UTF-8 so it can be
# inspected outside the notebook.
with open("pdfminer_output.text", "w", encoding="utf-8") as out_file:
    out_file.write(text)