In [9]:
import os
import time
import matplotlib.pyplot as plt
from PyPDF2 import PdfFileReader
import textract
import fitz  # PyMuPDF
import pdfplumber
import pdfrw
import pdftotext
import pypdfium2 as pdfium
from tika import parser
from borb.pdf import PDF


ImportError: cannot import name 'PDFStackT' from 'pdfminer.pdfinterp' (/home/vini/dev/bench_pdf/.venv/lib/python3.10/site-packages/pdfminer/pdfinterp.py)

- Benchmark reference: https://github.com/py-pdf/benchmarks
- Dataset: https://www.kaggle.com/datasets/sourceduty/chatgpt-books?resource=download

In [None]:
def extract_text_pypdf2(pdf_path):
    """Time a full-document text extraction with PyPDF2.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        float: Seconds elapsed while opening the file, building the reader
        and calling ``extract_text()`` on every page.
    """
    # perf_counter is monotonic and high resolution; time.time() can jump when
    # the system clock is adjusted, which would corrupt a benchmark.
    started = time.perf_counter()
    with open(pdf_path, 'rb') as file:
        reader = PdfFileReader(file)
        # Iterate ``reader.pages`` (as extract_text_pypdf does) instead of the
        # legacy ``numPages``/``getPage`` pair.
        for page in reader.pages:
            page.extract_text()
    return time.perf_counter() - started

In [None]:
def extract_text_textract(pdf_path):
    """Time a single ``textract.process`` call on one PDF.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        float: Seconds elapsed during the ``textract.process`` call.
    """
    # perf_counter is the monotonic, high-resolution clock intended for
    # measuring durations; time.time() follows the adjustable wall clock.
    started = time.perf_counter()
    textract.process(pdf_path)
    return time.perf_counter() - started

In [None]:
def extract_text_pymupdf(pdf_path):
    """Time a full-document text extraction with PyMuPDF (``fitz``).

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        float: Seconds elapsed while opening the document, calling
        ``get_text()`` on every page and closing the document.
    """
    # perf_counter is monotonic and high resolution, unlike time.time().
    started = time.perf_counter()
    # The context manager closes the document; without it every benchmarked
    # file stays open until the object happens to be garbage collected.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            page.get_text()
    return time.perf_counter() - started

In [None]:
def extract_text_pdfminer(pdf_path):
    """Time a single ``pdfminer_extract_text`` call on one PDF.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        float: Seconds elapsed during the extraction call.
    """
    # NOTE(review): ``pdfminer_extract_text`` is not imported in the visible
    # import cell, so this raises NameError on a fresh kernel. It presumably
    # refers to ``pdfminer.high_level.extract_text`` -- confirm and import it.
    # perf_counter is monotonic and high resolution, unlike time.time().
    started = time.perf_counter()
    pdfminer_extract_text(pdf_path)
    return time.perf_counter() - started

In [None]:
def extract_text_pdfplumber(pdf_path):
    """Time a full-document text extraction with pdfplumber.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        float: Seconds elapsed while opening the PDF, calling
        ``extract_text()`` on every page and leaving the ``with`` block.
    """
    # perf_counter is monotonic and high resolution, unlike time.time().
    started = time.perf_counter()
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page.extract_text()
    return time.perf_counter() - started

In [None]:
def extract_text_pdfrw(pdf_path):
    """Time reading every page's content stream with pdfrw.

    Args:
        pdf_path: Path of the PDF file to parse.

    Returns:
        float: Seconds elapsed while parsing the file and touching the
        ``Contents.stream`` attribute of each page that has ``/Contents``.
    """
    # perf_counter is monotonic and high resolution, unlike time.time().
    started = time.perf_counter()
    pdf = pdfrw.PdfReader(pdf_path)
    for page in pdf.pages:
        if '/Contents' in page:
            # NOTE(review): this only reads the raw ``stream`` attribute and
            # discards it; confirm that is a fair stand-in for text extraction
            # when comparing against the other libraries.
            _ = page.Contents.stream
    return time.perf_counter() - started

In [None]:
def extract_text_pdftotext(pdf_path):
    """Time a full-document text extraction with pdftotext.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        float: Seconds elapsed while opening the file, loading it into
        ``pdftotext.PDF`` and reading the text of every page.
    """
    # perf_counter is monotonic and high resolution, unlike time.time().
    started = time.perf_counter()
    with open(pdf_path, 'rb') as file:
        # Constructing ``pdftotext.PDF`` alone never asks for any page text;
        # iterating the object reads each page, so the timing covers the
        # actual extraction like the other benchmarks do.
        for _page_text in pdftotext.PDF(file):
            pass
    return time.perf_counter() - started

In [None]:
def extract_text_pypdfium2(pdf_path):
    """Time a full-document text extraction with pypdfium2.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        float: Seconds elapsed while opening the document and calling
        ``get_textpage().get_text_range()`` on every page.
    """
    # perf_counter is monotonic and high resolution, unlike time.time().
    started = time.perf_counter()
    pdf = pdfium.PdfDocument(pdf_path)
    for index in range(len(pdf)):
        pdf[index].get_textpage().get_text_range()
    return time.perf_counter() - started

In [None]:
def extract_text_pypdf(pdf_path):
    """Time a full-document text extraction via ``PdfFileReader(pdf_path)``.

    Args:
        pdf_path: Path of the PDF file, passed directly to the reader.

    Returns:
        float: Seconds elapsed while building the reader and calling
        ``extract_text()`` on every page.
    """
    # NOTE(review): despite the name, this uses ``PdfFileReader``, which the
    # visible import cell takes from PyPDF2. The 'pypdf' result therefore
    # measures PyPDF2 a second time rather than the separate pypdf package --
    # confirm the intended reader class.
    # perf_counter is monotonic and high resolution, unlike time.time().
    started = time.perf_counter()
    reader = PdfFileReader(pdf_path)
    for page in reader.pages:
        page.extract_text()
    return time.perf_counter() - started

In [None]:
def extract_text_tika(pdf_path):
    """Time a single ``parser.from_file`` call (Tika) on one PDF.

    Args:
        pdf_path: Path of the PDF file to parse.

    Returns:
        float: Seconds elapsed during the ``parser.from_file`` call.
    """
    # perf_counter is the monotonic, high-resolution clock intended for
    # measuring durations; time.time() follows the adjustable wall clock.
    started = time.perf_counter()
    parser.from_file(pdf_path)
    return time.perf_counter() - started

In [None]:
def extract_text_borb(pdf_path):
    """Time loading a PDF with borb and calling ``get_text()`` on its pages.

    Args:
        pdf_path: Path of the PDF file to load.

    Returns:
        float: Seconds elapsed while opening the file, loading it through
        ``PDF.loads`` and calling ``get_text()`` on every page object.
    """
    # perf_counter is monotonic and high resolution, unlike time.time().
    started = time.perf_counter()
    # Open the file in a ``with`` block so the handle is closed; the bare
    # ``open(...)`` used before leaked one descriptor per benchmarked file.
    with open(pdf_path, "rb") as file:
        pdf = PDF.loads(file)
        # NOTE(review): confirm ``get_page_objects`` / ``get_text`` exist in
        # the installed borb version.
        for page in pdf.get_page_objects():
            page.get_text()
    return time.perf_counter() - started

In [None]:
def process_pdfs_in_directory(directory):
    """Run every extraction benchmark over each PDF file in a directory.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        dict: Library label -> summed elapsed seconds over all PDFs, in the
        fixed label order defined below. Every total is ``0`` when the
        directory contains no PDF files.

    Raises:
        OSError: If ``directory`` cannot be listed. Any exception raised by
        an individual extractor propagates unchanged.
    """
    # Single source of truth for labels and their timing functions; insertion
    # order fixes the key order of the returned dict.
    extractors = {
        'PyPDF2': extract_text_pypdf2,
        'Textract': extract_text_textract,
        'PyMuPDF': extract_text_pymupdf,
        'pdfminer.six': extract_text_pdfminer,
        'pdfplumber': extract_text_pdfplumber,
        'pdfrw': extract_text_pdfrw,
        'pdftotext': extract_text_pdftotext,
        'pypdfium2': extract_text_pypdfium2,
        'pypdf': extract_text_pypdf,
        'Tika': extract_text_tika,
        'Borb': extract_text_borb,
    }

    # Match the extension case-insensitively so ``.PDF`` files are included,
    # skip directories that merely end in ``.pdf``, and sort so runs visit the
    # files in a reproducible order (os.listdir order is arbitrary).
    candidates = (os.path.join(directory, name) for name in os.listdir(directory))
    pdf_paths = sorted(
        path for path in candidates
        if path.lower().endswith('.pdf') and os.path.isfile(path)
    )

    total_times = {label: 0 for label in extractors}
    for pdf_path in pdf_paths:
        for label, extract in extractors.items():
            total_times[label] += extract(pdf_path)

    return total_times

In [None]:
def plot_execution_times(times):
    """Show a bar chart of total extraction time per library.

    Args:
        times: Mapping of library label -> total execution time in seconds.
            Bars are drawn in the mapping's iteration order.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    bar_colors = [
        'blue', 'green', 'red', 'purple', 'orange', 'yellow',
        'cyan', 'magenta', 'grey', 'pink', 'brown', 'lightblue',
    ]
    names = list(times)
    totals = [times[name] for name in names]

    plt.figure(figsize=(14, 8))
    plt.bar(names, totals, color=bar_colors)
    plt.title('Total Execution Time Comparison for PDF Text Extraction')
    plt.ylabel('Total Execution Time (s)')
    plt.xlabel('Libraries')
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Placeholder location: point this at the directory holding the PDFs to benchmark.
directory_path = 'path/to/your/pdf/directory'
# Run every extractor over every PDF found there, then chart the per-library totals.
total_times = process_pdfs_in_directory(directory_path)
plot_execution_times(total_times)